transformers 库基础组件

参考来源:B站-手把手带你实战HuggingFace Transformers-入门篇
官方文档:HuggingFace-transformers

pipeline

查看支持任务

1
2
3
4
5
6
# List every task name the pipeline API supports, with its default config.
from transformers.pipelines import SUPPORTED_TASKS
from transformers import pipeline  # was missing in the original; pipeline() is used below

for task_name, task_info in SUPPORTED_TASKS.items():
    print(task_name, task_info)

# One-line text classification: downloads the model on first use.
pipe = pipeline("text-classification", model="uer/roberta-base-finetuned-dianping-chinese")
pipe("我觉得不太行!")

背后实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def my_pipeline():
    """Re-implement pipeline("text-classification") step by step.

    Returns:
        The predicted label string for a fixed example sentence.
    """
    tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
    model = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")

    input_text = "我觉得不太行!"
    inputs = tokenizer(input_text, return_tensors="pt")
    res = model(**inputs)

    # Turn raw logits into probabilities, then pick the most likely class id.
    logits = res.logits
    logits = torch.softmax(logits, dim=-1)
    pred = torch.argmax(logits).item()

    # Map the class id back to its human-readable label.
    result = model.config.id2label.get(pred)
    return result

tokenizer

属性查看

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from transformers import AutoTokenizer

# Load from the HuggingFace Hub: pass a model name to get its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
# Save the tokenizer to a local directory.
tokenizer.save_pretrained("./roberta_tokenizer")

# Load the tokenizer back from the local directory.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
sen = "弱小的我也有大梦想!"

tokens = tokenizer.tokenize(sen) # split the sentence into tokens
tokenizer.vocab # token -> id vocabulary mapping
tokenizer.vocab_size

# Convert between token sequences and id sequences.
ids = tokenizer.convert_tokens_to_ids(tokens)
tokens = tokenizer.convert_ids_to_tokens(ids)

# Join a token sequence back into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)

# String -> id sequence, a.k.a. encoding.
ids = tokenizer.encode(sen, add_special_tokens=True)
# Id sequence -> string, a.k.a. decoding.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)

# Padding to a fixed length.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
# Truncation to a maximum length.
ids = tokenizer.encode(sen, max_length=5, truncation=True)

# Full manual pipeline: ids + attention mask + token type ids.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

使用方法

1
2
3
4
5
6
7
8
9
# A single sentence.
sen = "弱小的我也有大梦想!"
inputs = tokenizer(sen, padding="max_length", max_length=15)

# Multiple sentences: pass a list and each one is encoded.
sens = ["弱小的我也有大梦想",
"有梦想谁都了不起",
"追逐梦想的心,比梦想本身,更可贵"]
res = tokenizer(sens)

细节问题

  1. 存在fast/slow tokenizer:
    • 默认是 fast,基于Rust实现,速度快;额外提供 return_offsets_mapping 和 word_ids() 等功能
    • 可以指定use_fast=False,基于python实现,速度慢。
1
2
3
4
5
6
7
# Fast tokenizer (Rust-backed, the default): supports offset mapping and word_ids().
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs.word_ids()

# Slow tokenizer (pure Python): no offset-mapping / word_ids support.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
  1. 特殊情况:模型带有自定义代码时,需要开启信任远程代码 trust_remote_code=True
1
2
# Models that ship custom tokenizer code on the Hub require trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen))

model

  • 分类:
    1. 编码器模型:自编码,Encoder,双向注意力机制,每个词看上下文。(阅读理解,文本分类,意图识别)
    2. 解码器模型:自回归,Decoder,单向注意力机制,只能看前文。(文本生成)
    3. 编码器解码器模型:序列到序列,Encoder双向+Decoder单向。(文本摘要,机器翻译)

model head

  • Model Head 是连接在模型后的层,通常为一个或多个全连接层,它将模型的编码表示进行映射,以解决不同的任务类型。
1
!git lfs clone "https://huggingface.co/hfl/rbt3" --include="*.bin"
1
2
3
4
5
6
7
8
9
10
11
12
13
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForSequenceClassification

sen = "弱小的我也有大梦想!"
tokenizer = AutoTokenizer.from_pretrained("rbt3")
inputs = tokenizer(sen, return_tensors="pt")

# 1. Backbone only (no model head): returns hidden states (attentions enabled here).
model = AutoModel.from_pretrained("rbt3", output_attentions=True) # see model.config
output = model(**inputs)

# 2. With a task head: sequence-classification logits over num_labels classes.
clz_model = AutoModelForSequenceClassification.from_pretrained("rbt3", num_labels=10)
clz_output = clz_model(**inputs) # clz_model.config.num_labels is now 10

demo

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from torch.optim import Adam

# 数据集、数据加载器(含预处理)
class MyDataset(Dataset):
    """Hotel-review sentiment dataset backed by a CSV with `review`/`label` columns."""

    def __init__(self, data_path) -> None:
        super().__init__()
        self.data = pd.read_csv(data_path)
        # Drop rows with missing values so __getitem__ never yields NaN.
        self.data = self.data.dropna()

    def __getitem__(self, index):
        # Positional indexing: valid for any 0 <= index < len(self).
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)

# Load the dataset and split 90/10 into train/validation.
dataset = MyDataset("./ChnSentiCorp_htl_all.csv")
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])

tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def collate_func(batch):
    """Tokenize a batch of (text, label) pairs into padded tensor inputs."""
    texts, labels = [], []
    for text, label in batch:
        texts.append(text)
        labels.append(label)
    inputs = tokenizer(texts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
    inputs["labels"] = torch.tensor(labels)
    return inputs

trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)

# Model: BERT-style encoder with a (default binary) classification head.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")
if torch.cuda.is_available():
    model = model.cuda()

# Optimizer.
optimizer = Adam(model.parameters(), lr=2e-5)


# 训练与验证
def evaluate():
    """Compute accuracy of the global `model` on `validloader`.

    Returns:
        Fraction of validation samples predicted correctly (0..1 tensor).
    """
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc_num += (pred.long() == batch["labels"].long()).float().sum()
    return acc_num / len(validset)

def train(epoch=3, log_step=100):
    """Run `epoch` training passes over `trainloader`.

    Logs the loss every `log_step` optimizer steps and prints validation
    accuracy after each epoch.
    """
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")


# Kick off training (3 epochs by default).
train()

训练完毕后,进行随机测试:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
sen = "我觉得这家酒店不错,饭很好吃!"
id2_label = {0: "差评!", 1: "好评!"}

# Method 1: manual forward pass.
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Guard the device move so the snippet also runs on CPU-only machines
    # (the original moved inputs to CUDA unconditionally).
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入:{sen}\n模型预测结果:{id2_label.get(pred.item())}")

# Method 2: wrap the trained model in a pipeline.
from transformers import pipeline
model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
print(pipe(sen))

Datasets

加载在线数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from datasets import *

# Load a single dataset / one sub-task of a dataset collection.
datasets = load_dataset("madao33/new-title-chinese")
boolq_dataset = load_dataset("super_glue", "boolq")

# Load only a specific slice.
dataset = load_dataset("madao33/new-title-chinese", split="train")
dataset = load_dataset("madao33/new-title-chinese", split="train[10:100]")
dataset = load_dataset("madao33/new-title-chinese", split="train[:50%]")
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])

# Inspect.
datasets["train"][0]
datasets["train"][:2]
datasets["train"]["title"][:5]
datasets["train"].column_names
datasets["train"].features

# Train/test split.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1)

dataset = boolq_dataset["train"]
# Classification datasets can be split stratified on the label column.
dataset.train_test_split(test_size=0.1, stratify_by_column="label")

# Select rows by index.
datasets["train"].select([0, 1])
# Filter rows with a predicate.
filter_dataset = datasets["train"].filter(lambda example: "中国" in example["title"])

# Map a transform over every example.
def add_prefix(example):
    """Prepend a fixed prefix to the `title` field of one example."""
    example["title"] = 'Prefix: ' + example["title"]
    return example

prefix_dataset = datasets.map(add_prefix)

# Typical preprocessing for seq2seq-style training.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

def preprocess_function(example, tokenizer=tokenizer):
    """Tokenize `content` as model inputs and `title` as labels."""
    model_inputs = tokenizer(example["content"], max_length=512, truncation=True)
    labels = tokenizer(example["title"], max_length=32, truncation=True)
    # The labels are simply the encoded title ids.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

processed_datasets = datasets.map(preprocess_function, remove_columns=datasets["train"].column_names)
# Speed-ups: num_proc=4 (multiprocessing) or batched=True (batched mapping).


# Save to / load from disk.
processed_datasets.save_to_disk("./processed_data")
processed_datasets = load_from_disk("./processed_data")

加载本地数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Direct load, option 1: the generic "csv" loading script.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
# Direct load, option 2: the Dataset class method.
dataset = Dataset.from_csv("./ChnSentiCorp_htl_all.csv")

# Multiple files, option 1: point data_files at a folder (all csv files inside).
dataset = load_dataset("csv", data_files="./all_data/", split='train') # every csv in the folder
# Multiple files, option 2: list the files explicitly.
dataset = load_dataset("csv", data_files=["./all_data/ChnSentiCorp_htl_all.csv", "./all_data/ChnSentiCorp_htl_all copy.csv"], split='train') # explicit file list

# From other formats, option 1: a pandas DataFrame.
import pandas as pd
data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
dataset = Dataset.from_pandas(data)
# From other formats, option 2: a list of dicts.
# List data must be dicts with explicit field names; a bare ['abc', 'def'] is rejected.
data = [{"text": "abc"}, {"text": "def"}]
Dataset.from_list(data)

# Custom loading script.
dataset = load_dataset("./load_script.py", split="train")
# load_script.py must define _info, _split_generators and _generate_examples.

DataCollator

注意:DataCollatorWithPadding只能帮忙处理 input_ids、token_type_ids、attention_mask、labels 这四个字段,若数据集本身含其他字段,则需要自行处理 padding。

1
2
3
4
5
6
7
8
9
10
11
12
13
from transformers import DataCollatorWithPadding

dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split='train')
# Drop rows whose review text is missing.
dataset = dataset.filter(lambda x: x["review"] is not None)

def process_function(examples):
    """Tokenize a batch of reviews; no padding here — the collator pads per batch."""
    tokenized_examples = tokenizer(examples["review"], max_length=128, truncation=True)
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset.column_names)
# Dynamic padding: each batch is padded only to its own longest sequence.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=collator, shuffle=True)

Evaluate

清单与说明

1
2
3
4
5
6
7
8
9
10
11
import evaluate
# As of 2024-01-11, list_evaluation_modules does not show every supported
# metric, but that does not affect loading and using them.
# The complete metric list lives at https://huggingface.co/evaluate-metric
evaluate.list_evaluation_modules(include_community=False, with_details=True)

# Example: load the accuracy metric.
accuracy = evaluate.load("accuracy")
# Inspect its usage documentation.
print(accuracy.description)
print(accuracy.inputs_description)
print(accuracy)

计算使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# One-shot computation over full lists.
accuracy = evaluate.load("accuracy")
results = accuracy.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
# {'accuracy': 0.5}

# Incremental computation 1: add one sample at a time.
accuracy = evaluate.load("accuracy")
for ref, pred in zip([0, 1, 0, 1], [1, 0, 0, 1]):
    accuracy.add(references=ref, predictions=pred)
accuracy.compute()

# Incremental computation 2: add one batch at a time.
accuracy = evaluate.load("accuracy")
for refs, preds in zip([[0, 1], [0, 1]], [[1, 0], [0, 1]]):
    accuracy.add_batch(references=refs, predictions=preds)
accuracy.compute()


# Combine several metrics into a single compute() call.
clf_metrics = evaluate.combine(["accuracy", "f1", "recall", "precision"])
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])
# {'accuracy': 0.6666666666666666,
#  'f1': 0.6666666666666666,
#  'recall': 0.5,
#  'precision': 1.0}


# Visualize results (currently only radar plots are supported).
from evaluate.visualization import radar_plot
data = [
    {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
    {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
    {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
    {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6},
]
model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
plot = radar_plot(data=data, model_names=model_names)

Trainer

更新 demo 版本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
import torch
import evaluate

# 1. 加载数据集、划分
# 1. Load the CSV dataset, drop broken rows, and split 90/10 train/test.
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda x: x["review"] is not None)
datasets = dataset.train_test_split(test_size=0.1)

# 2. Preprocess: tokenize the reviews (padding is left to the data collator).
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels."""
    tokenized_examples = tokenizer(examples["review"], max_length=128, truncation=True)
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)

# 3. Create the model (default binary classification head).
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# 4. Evaluation function combining accuracy and F1.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    """Compute accuracy and F1 from a (logits, labels) prediction tuple."""
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

# 5. 创建 Trainer
# 5. Build the Trainer.
train_args = TrainingArguments(
    output_dir="./checkpoints",       # output folder
    per_device_train_batch_size=64,   # training batch size
    per_device_eval_batch_size=128,   # evaluation batch size
    logging_steps=10,                 # logging frequency
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",            # save once per epoch
    save_total_limit=3,               # keep at most 3 checkpoints
    learning_rate=2e-5,
    weight_decay=0.01,
    metric_for_best_model="f1",       # which metric defines "best"
    load_best_model_at_end=True,      # reload the best checkpoint after training
)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

# 6. Train.
trainer.train()

# 7. Evaluate on the held-out split.
trainer.evaluate(tokenized_datasets["test"])

# 8. Predict.
trainer.predict(tokenized_datasets["test"])

# Quick manual check via pipeline (original had a duplicated `id2_label = id2_label =` assignment).
from transformers import pipeline
id2_label = {0: "差评!", 1: "好评!"}
model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
sen = "我觉得不错!"
pipe(sen)