Reading notes: 《大模型推荐系统 | 算法原理、代码实战与案例分析》 (Large Language Model Recommender Systems: Algorithm Principles, Hands-On Code, and Case Studies), by 刘强 (Liu Qiang)
Dataset source: the MIND news dataset (used in the hands-on code of section 3)

Note: tooling used includes git-lfs (a large-file management tool) and trl (a reinforcement-learning training library).

The pre-training paradigm

Recommendation via large language model pre-training

1. The general approach to pre-training

1.1 Preparing the pre-training data

  • Use the user behavior sequence directly
  • Convert the item IDs in the behavior sequence into related text (see the sketch after this list)
  • Integrate user features into the behavior sequence
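
As a minimal sketch of the second bullet: map each clicked item ID to its title and join the titles into a text description that a language model can consume. The lookup table, IDs, and wording below are hypothetical, for illustration only:

# Minimal sketch (hypothetical IDs and titles): ID sequence -> text
item_titles = {"N1001": "NBA playoff recap", "N1002": "Local weather alert"}

def sequence_to_text(uid, item_ids):
    # Fall back to the raw ID when no title is available
    titles = [item_titles.get(i, i) for i in item_ids]
    return f"User {uid} recently read: " + "; ".join(titles)

print(sequence_to_text("U42", ["N1001", "N1002"]))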

1.2 Choosing a model architecture

  • BERT4Rec
  • PTUM
  • P5
  • M6-Rec

1.3 Pre-training the model

  • The Hugging Face Transformers library
  • Apple's MLX framework

1.4 Model inference

  • The Transformers library (a minimal inference sketch follows this list)
  • Inference with llama.cpp
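
For the Transformers route, inference on a pre-trained text-to-text recommender is a standard generate call. A minimal sketch, assuming a T5-style checkpoint (the stock t5-small here; in practice you would point at the directory saved in section 3.2) and an illustrative prompt:

from transformers import AutoTokenizer, T5ForConditionalGeneration

backbone = "t5-small"  # placeholder; substitute the pre-trained checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(backbone)
model = T5ForConditionalGeneration.from_pretrained(backbone)

prompt = "Considering mind user_1 has interacted with item_12, item_7. What is the next recommendation?"
inputs = tokenizer(prompt, return_tensors="pt")
# Beam search returns the top-k candidate continuations, i.e., the top-k recommended items
outputs = model.generate(**inputs, max_length=30, num_beams=5, num_return_sequences=5)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))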

2. Case studies

2.1 A pre-trained recommender system based on the PTUM architecture

PTUM: Pre-Training User Model

  1. Masked behavior prediction (sketched below)
  2. Subsequent behavior prediction
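
A minimal sketch of the first task: behaviors in the sequence are masked at random and the model is trained to recover them, analogous to masked language modeling over behavior IDs rather than words. The mask token and mask ratio below are assumptions for illustration, not values from the PTUM paper:

import random

MASK = "[MASK]"  # hypothetical mask token

def mask_behaviors(behavior_seq, mask_ratio=0.15):
    """Randomly mask behaviors; the model learns to recover the masked ones."""
    masked, targets = [], []
    for b in behavior_seq:
        if random.random() < mask_ratio:
            masked.append(MASK)
            targets.append(b)  # prediction target for this position
        else:
            masked.append(b)
    return masked, targets

print(mask_behaviors(["N1", "N2", "N3", "N4", "N5"]))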

2.2 A pre-trained recommender system based on P5

P5: Pretrain, Personalized Prompt, and Predict Paradigm

  • Pre-training data preparation (illustrative prompts for the three task families follow this list)
    1. Rating, review, and explanation tasks
    2. Sequential recommendation
    3. Direct recommendation
  • The P5 model architecture
  • P5 pre-training
  • P5 inference
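
P5 casts all of these task families as text-to-text pairs, so a single model can serve rating, sequential, and direct recommendation. The (input, output) pairs below illustrate that style; they are hypothetical, not copied from the P5 paper's prompt collection:

# Hypothetical P5-style (input, output) pairs, one per task family
rating_example = ("What star rating will user_23 give item_57?", "4")
sequential_example = ("user_15 has clicked item_1, item_9, item_4. What will the user click next?", "item_8")
direct_example = ("Should we recommend item_30 to user_7?", "yes")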

3. Hands-on code with the MIND dataset

3.1 Preparing the pre-training dataset

(1) Generate the sequence data for pre-training
File: generate_user_sequence.py

import csv

# Each row of MIND's behaviors.tsv is: impression ID, user ID, time,
# click history (space-separated news IDs), and impressions.
user_sequence_list = []
with open('../data/mind/behaviors.tsv', 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        uid = row[1]
        history = row[3]
        if len(history.split(" ")) >= 5:  # keep only users with at least 5 clicks
            r = uid + " " + history
            user_sequence_list.append(r)

user_sequence_path = "../data/mind/user_sequence.txt"
with open(user_sequence_path, 'a') as file:
    for r in user_sequence_list:
        file.write(r + "\n")

(2) Generate the pre-training data
File: generate_dataset_train.py

import json
import os
import sys

import fire

sys.path.append('../')
from utils import sequential_indexing, load_prompt_template, check_task_prompt, get_info_from_prompt
from utils import construct_user_sequence_dict, read_line


def main(data_path: str, item_indexing: str, task: str, dataset: str, prompt_file: str, sequential_order: str,
         max_his: int, his_sep: str, his_prefix: int, skip_empty_his: int):
    file_data = dict()
    file_data['arguments'] = {
        "data_path": data_path, "item_indexing": item_indexing, "task": task,
        "dataset": dataset, "prompt_file": prompt_file, "sequential_order": sequential_order,
        "max_his": max_his, "his_sep": his_sep, "his_prefix": his_prefix, "skip_empty_his": skip_empty_his
    }
    file_data['data'] = []
    tasks = list(task)  # fire parses a comma-separated --task flag into a tuple
    user_sequence = read_line(os.path.join(data_path, dataset, 'user_sequence.txt'))
    user_sequence_dict = construct_user_sequence_dict(user_sequence)

    reindex_user_seq_dict, item_map = sequential_indexing(data_path, dataset,
                                                          user_sequence_dict, sequential_order)

    # get prompt
    prompt = load_prompt_template(prompt_file, tasks)
    info = get_info_from_prompt(prompt)
    check_task_prompt(prompt, tasks)

    # Load training data samples. The last two items of each sequence are held
    # out for validation and test (leave-one-out), so training uses items[:-2].
    training_data_samples = []
    for user in reindex_user_seq_dict:
        items = reindex_user_seq_dict[user][:-2]
        for i in range(len(items)):
            if i == 0:
                if skip_empty_his > 0:
                    continue
            one_sample = dict()
            one_sample['dataset'] = dataset
            one_sample['user_id'] = user
            if his_prefix > 0:
                one_sample['target'] = 'item_' + items[i]
            else:
                one_sample['target'] = items[i]
            if 'history' in info:
                history = items[:i]
                if max_his > 0:
                    history = history[-max_his:]
                if his_prefix > 0:
                    one_sample['history'] = his_sep.join(["item_" + item_idx for item_idx in history])
                else:
                    one_sample['history'] = his_sep.join(history)
            training_data_samples.append(one_sample)
    print("load training data")
    print(f'there are {len(training_data_samples)} samples in training data.')

    # construct sentences
    for i in range(len(training_data_samples)):
        one_sample = training_data_samples[i]
        for t in tasks:
            datapoint = {'task': dataset + t, 'data_id': i}
            for pid in prompt[t]['seen']:
                datapoint['instruction'] = prompt[t]['seen'][pid]['Input']
                datapoint['input'] = prompt[t]['seen'][pid]['Input'].format(**one_sample)
                datapoint['output'] = prompt[t]['seen'][pid]['Output'].format(**one_sample)
                file_data['data'].append(datapoint.copy())

    print("data constructed")
    print(f"there are {len(file_data['data'])} prompts in training data.")

    # save the data to json file
    output_path = f'{dataset}_{task}_{item_indexing}_train.json'

    with open(os.path.join(data_path, dataset, output_path), 'w') as openfile:
        json.dump(file_data, openfile)


if __name__ == "__main__":
    fire.Fire(main)
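
The prompt_file consumed above is parsed by load_prompt_template in utils.py (listed next): one template per line, with semicolon-separated fields in the order task;seen-or-unseen;Input;Output, where {placeholders} are filled from each sample. A hypothetical two-line file consistent with that parser and with the --task sequential,straightforward flag used later:

sequential;seen;Considering {dataset} user_{user_id} has interacted with {history}. What is the next recommendation for the user?;{target}
straightforward;seen;What should we recommend for {dataset} user_{user_id}?;{target}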

Appendix: utils.py

import math
import os
import re
import numpy as np
import torch
from torch.utils.data import Dataset


def get_dict_from_lines(lines):
    """
    Used to get a user or item map from lines loaded from a txt file.
    """
    index_map = dict()
    for line in lines:
        info = line.split(" ")
        index_map[info[0]] = info[1]
    return index_map


def read_line(path):
    if not os.path.exists(path):
        raise FileNotFoundError
    lines = []
    with open(path, 'r') as fd:
        for line in fd:
            lines.append(line.rstrip('\n'))
    return lines


def write_dict_2_file(path, write_dict):
    with open(path, 'w') as out:
        for user, items in write_dict.items():
            if type(items) == list:
                out.write(user + ' ' + ' '.join(items) + '\n')
            else:
                out.write(user + ' ' + str(items) + '\n')


class EvaluationDataset(Dataset):
    def __init__(self, dataset, tokenizer, cutoff):
        super().__init__()
        self.input = tokenizer(
            dataset['input'], padding="longest", truncation=True, max_length=cutoff
        )
        self.output = tokenizer(
            dataset['output'], padding="longest", truncation=True, max_length=cutoff
        )

    def __len__(self):
        return len(self.input["input_ids"])

    def __getitem__(self, index):
        return {
            "input_ids": torch.tensor(self.input["input_ids"][index]),
            "attention_mask": torch.tensor(self.input["attention_mask"][index]),
            'label': torch.tensor(self.output["input_ids"][index])
        }


def load_prompt_template(path, task_list):
    """
    Load prompt templates from the file. Keep training tasks only.
    Input:
    - path: The path of the prompt template txt file.
    - task_list: A list of required tasks.
    Return:
    - prompt_templates: a dictionary of prompt templates, e.g., {task: {'seen': {'0': {'Input': template_input, 'Output': template_output}}}}
    """

    if not os.path.exists(path):
        raise FileNotFoundError
    prompt_info = read_line(path)
    prompt_templates = dict()
    for prompt in prompt_info:
        t = [sens.strip() for sens in prompt.split(';')]
        if t[0] not in task_list:
            continue
        if t[0] not in prompt_templates:
            prompt_templates[t[0]] = dict()
        if t[1] not in prompt_templates[t[0]]:
            prompt_templates[t[0]][t[1]] = dict()
        num = len(prompt_templates[t[0]][t[1]])
        prompt_templates[t[0]][t[1]][str(num)] = dict()
        prompt_templates[t[0]][t[1]][str(num)]['Input'] = t[2]
        prompt_templates[t[0]][t[1]][str(num)]['Output'] = t[3]
    return prompt_templates


def generate_user_map(user_sequence_dict):
    """
    Generate a user map based on the user sequence dict.
    """
    user_map = dict()
    for user in user_sequence_dict.keys():
        user_map[user] = str(len(user_map) + 1)
    return user_map


def reindex(user_sequence_dict, user_map, item_map):
    """
    Reindex the given user sequence dict by the given user map and item map.
    """
    reindex_user_sequence_dict = dict()
    for user in user_sequence_dict:
        uid = user_map[user]
        items = user_sequence_dict[user]
        reindex_user_sequence_dict[uid] = [item_map[i] for i in items]

    return reindex_user_sequence_dict


def construct_user_sequence_dict(user_sequence):
    """
    Convert a list of strings to a user sequence dict: user as key, item list as value.
    """

    user_seq_dict = dict()
    for line in user_sequence:
        user_seq = line.split(" ")
        user_seq_dict[user_seq[0]] = user_seq[1:]
    return user_seq_dict


def sequential_indexing(data_path, dataset, user_sequence_dict, order):
    """
    Use the sequential indexing method to index the given user sequence dict.
    """
    user_index_file = os.path.join(data_path, dataset, 'user_indexing.txt')
    item_index_file = os.path.join(data_path, dataset, f'item_sequential_indexing_{order}.txt')
    reindex_sequence_file = os.path.join(data_path, dataset, f'user_sequence_sequential_indexing_{order}.txt')

    if os.path.exists(reindex_sequence_file):
        user_sequence = read_line(reindex_sequence_file)

        item_info = read_line(item_index_file)
        item_map = get_dict_from_lines(item_info)

        return construct_user_sequence_dict(user_sequence), item_map

    # For the user index, load from the txt file if it already exists; otherwise generate it from the user sequences and save it.
    if os.path.exists(user_index_file):
        user_info = read_line(user_index_file)
        user_map = get_dict_from_lines(user_info)
    else:
        user_map = generate_user_map(user_sequence_dict)
        write_dict_2_file(user_index_file, user_map)

    # For the item index, load from the txt file if it already exists; otherwise generate it from the user sequences and save it.
    if os.path.exists(item_index_file):
        item_info = read_line(item_index_file)
        item_map = get_dict_from_lines(item_info)
    else:
        item_map = dict()
        if order == 'original':
            user_list = user_sequence_dict.keys()
        elif order == 'short2long':
            user_list = sorted(user_sequence_dict, key=lambda x: len(user_sequence_dict[x]), reverse=False)
        elif order == 'long2short':
            user_list = sorted(user_sequence_dict, key=lambda x: len(user_sequence_dict[x]), reverse=True)

        # Index training items (all but the last two of each sequence) first,
        # then the held-out validation/test items; item indices start at 1001.
        for user in user_list:
            items = user_sequence_dict[user][:-2]
            for item in items:
                if item not in item_map:
                    item_map[item] = str(len(item_map) + 1001)
        for user in user_list:
            items = user_sequence_dict[user][-2:]
            for item in items:
                if item not in item_map:
                    item_map[item] = str(len(item_map) + 1001)
        write_dict_2_file(item_index_file, item_map)

    reindex_user_sequence_dict = reindex(user_sequence_dict, user_map, item_map)
    write_dict_2_file(reindex_sequence_file, reindex_user_sequence_dict)
    return reindex_user_sequence_dict, item_map


def get_info_from_prompt(prompt_templates):
    """
    Extract the required information from the prompt templates.
    Input:
    - prompt_templates: a dictionary of prompt templates.
    Output:
    - info: a list of required information (the placeholder names).
    """

    info = []
    for task in prompt_templates:
        for see in prompt_templates[task]:
            for i in prompt_templates[task][see]:
                info += re.findall(r'\{.*?\}', prompt_templates[task][see][i]['Input'])
                info += re.findall(r'\{.*?\}', prompt_templates[task][see][i]['Output'])
    info = [i[1:-1] for i in set(info)]
    return info


def check_task_prompt(prompt_templates, task_list):
    """
    Check that every task has prompt templates. Raise an error if a training task has no prompt.
    Input:
    - prompt_templates: A dictionary of prompt templates.
    - task_list: A list of training tasks.
    """
    for task in task_list:
        assert task in prompt_templates, f"No prompt for {task} task"


def evaluation_results(predictions, targets, scores, k):
    results = []
    batch_length = len(targets)
    for b in range(batch_length):
        one_batch_sequence = predictions[b * k: (b + 1) * k]
        one_batch_score = scores[b * k: (b + 1) * k]
        pairs = [(a, b) for a, b in zip(one_batch_sequence, one_batch_score)]
        sorted_pairs = sorted(pairs, key=lambda x: x[1], reverse=True)
        gt = targets[b]
        one_results = []
        for sorted_pred in sorted_pairs:
            if sorted_pred[0] == gt:
                one_results.append(1)
            else:
                one_results.append(0)

        results.append(one_results)
    return results


def ndcg_at_k(relevance, k):
    """
    Since we apply leave-one-out, each user has only one ground-truth item, so the IDCG is 1.0.
    """
    ndcg = 0.0
    for row in relevance:
        rel = row[:k]
        one_ndcg = 0.0
        for i in range(len(rel)):
            one_ndcg += rel[i] / math.log(i + 2, 2)
        ndcg += one_ndcg
    return ndcg


def hit_at_k(relevance, k):
    correct = 0.0
    for row in relevance:
        rel = row[:k]
        if sum(rel) > 0:
            correct += 1
    return correct


def get_metrics_results(rel_results, metrics):
    res = []
    for m in metrics:
        if m.lower().startswith('hit'):
            k = int(m.split('@')[1])
            res.append(hit_at_k(rel_results, k))
        elif m.lower().startswith('ndcg'):
            k = int(m.split('@')[1])
            res.append(ndcg_at_k(rel_results, k))

    return np.array(res)


def load_test(reindex_user_seq_dict, info, dataset, his_prefix, max_his, his_sep):
    data_samples = []
    for user in reindex_user_seq_dict:
        items = reindex_user_seq_dict[user]
        one_sample = dict()
        one_sample['dataset'] = dataset
        one_sample['user_id'] = user
        if his_prefix > 0:
            one_sample['target'] = 'item_' + items[-1]
        else:
            one_sample['target'] = items[-1]
        if 'history' in info:
            history = items[:-1]
            if max_his > 0:
                history = history[-max_his:]
            if his_prefix > 0:
                one_sample['history'] = his_sep.join(["item_" + item_idx for item_idx in history])
            else:
                one_sample['history'] = his_sep.join(history)
        data_samples.append(one_sample)
    return data_samples


def load_validation(reindex_user_seq_dict, info, dataset, his_prefix, max_his, his_sep):
    data_samples = []
    for user in reindex_user_seq_dict:
        items = reindex_user_seq_dict[user]
        one_sample = dict()
        one_sample['dataset'] = dataset
        one_sample['user_id'] = user
        if his_prefix > 0:
            one_sample['target'] = 'item_' + items[-2]
        else:
            one_sample['target'] = items[-2]
        if 'history' in info:
            history = items[:-2]
            if max_his > 0:
                history = history[-max_his:]
            if his_prefix > 0:
                one_sample['history'] = his_sep.join(["item_" + item_idx for item_idx in history])
            else:
                one_sample['history'] = his_sep.join(history)
        data_samples.append(one_sample)
    return data_samples
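
A quick sanity check of the metric helpers above, with hypothetical relevance lists (assuming utils.py is importable):

from utils import hit_at_k, ndcg_at_k

relevance = [[0, 1, 0], [0, 0, 0]]  # two users' top-3 results; 1 marks the ground-truth item
print(hit_at_k(relevance, 3))   # 1.0: one of the two users has a hit in the top 3
print(ndcg_at_k(relevance, 3))  # ~0.631: 1 / log2(3) for a hit at rank 2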

(3) Generate the test and validation data
File: generate_dataset_eval.py

import json
import os
import sys

import fire

sys.path.append('../')
from utils import sequential_indexing, load_prompt_template, check_task_prompt, get_info_from_prompt
from utils import construct_user_sequence_dict, read_line, load_test, load_validation


def main(data_path: str, item_indexing: str, task: str, dataset: str, prompt_file: str, sequential_order: str,
         max_his: int, his_sep: str, his_prefix: int, skip_empty_his: int,
         mode: str, prompt: str):
    file_data = dict()
    file_data['arguments'] = {
        "data_path": data_path, "item_indexing": item_indexing, "task": task,
        "dataset": dataset, "prompt_file": prompt_file, "sequential_order": sequential_order,
        "max_his": max_his, "his_sep": his_sep, "his_prefix": his_prefix, "skip_empty_his": skip_empty_his,
        "mode": mode, "prompt": prompt
    }
    file_data['data'] = []
    tasks = list(task)

    user_sequence = read_line(os.path.join(data_path, dataset, 'user_sequence.txt'))
    user_sequence_dict = construct_user_sequence_dict(user_sequence)
    reindex_user_seq_dict, item_map = sequential_indexing(data_path, dataset,
                                                          user_sequence_dict, sequential_order)

    # get prompt
    prompt_ = load_prompt_template(prompt_file, tasks)
    info = get_info_from_prompt(prompt_)
    check_task_prompt(prompt_, tasks)

    # Load data samples; prompt is of the form 'seen:0' or 'unseen:0'
    if mode == 'validation':
        data_samples = load_validation(reindex_user_seq_dict, info, dataset, his_prefix, max_his, his_sep)
        prompt_info = prompt.split(':')
        output_path = f'{dataset}_{task}_{item_indexing}_validation_{prompt}.json'
    elif mode == 'test':
        data_samples = load_test(reindex_user_seq_dict, info, dataset, his_prefix, max_his, his_sep)
        prompt_info = prompt.split(':')
        output_path = f'{dataset}_{task}_{item_indexing}_test_{prompt}.json'
    else:
        raise NotImplementedError

    # construct sentences
    for i in range(len(data_samples)):
        one_sample = data_samples[i]
        for t in tasks:
            datapoint = {'task': dataset + t,
                         'instruction': prompt_[t][prompt_info[0]][prompt_info[1]]['Input'],
                         'input': prompt_[t][prompt_info[0]][prompt_info[1]]['Input'].format(**one_sample),
                         'output': prompt_[t][prompt_info[0]][prompt_info[1]]['Output'].format(**one_sample)}
            file_data['data'].append(datapoint.copy())

    with open(os.path.join(data_path, dataset, output_path), 'w') as openfile:
        json.dump(file_data, openfile)


if __name__ == "__main__":
    fire.Fire(main)

3.2 Model pre-training

Based on a T5 backbone
File: t5_pre-train.py

import os
import fire
import transformers
from datasets import load_dataset
from transformers import (
    T5Config,
    T5ForConditionalGeneration,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)


def main(backbone: str, data_path: str, item_indexing: str, task: str, dataset: str,
         valid_prompt: str, cutoff: int, model_dir: str, batch_size: int, valid_select: int,
         epochs: int, lr: float, warmup_steps: int, gradient_accumulation_steps: int,
         logging_steps: int, optim: str, eval_steps: int, save_steps: int, save_total_limit: int):

    config = T5Config.from_pretrained(backbone)
    model = T5ForConditionalGeneration.from_pretrained(backbone, config=config)
    tokenizer = AutoTokenizer.from_pretrained(backbone)

    train_data_file = os.path.join(data_path, dataset,
                                   f'{dataset}_{task}_{item_indexing}_train.json')
    valid_data_file = os.path.join(data_path, dataset,
                                   f'{dataset}_{task}_{item_indexing}_validation_{valid_prompt}.json')
    train_data = load_dataset("json", data_files=train_data_file, field='data')
    valid_data = load_dataset("json", data_files=valid_data_file, field='data')

    def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt, truncation=True, max_length=cutoff, padding=False, return_tensors=None,
        )
        if (isinstance(result["input_ids"][-1], int) and result["input_ids"][-1] != tokenizer.eos_token_id
                and len(result["input_ids"]) < cutoff
                and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)
        elif isinstance(result["input_ids"][-1], list) and add_eos_token:
            for i in range(len(result['input_ids'])):
                if result["input_ids"][i][-1] != tokenizer.eos_token_id and len(result["input_ids"][i]) < cutoff:
                    result["input_ids"][i].append(tokenizer.eos_token_id)
                    result["attention_mask"][i].append(1)
        result["labels"] = result["input_ids"].copy()
        return result

    def process_func(datapoint):
        encoding = tokenize(datapoint['input'], add_eos_token=True)
        labels = tokenize(datapoint['output'], add_eos_token=True)
        encoding['labels'] = labels['input_ids'].copy()
        # return encoding
        return {**datapoint, **encoding}

    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"
    train_set = train_data['train'].shuffle().map(process_func, batched=True)
    valid_set = valid_data['train'].shuffle().map(process_func, batched=True)
    output_dir = os.path.join(model_dir, dataset, item_indexing, backbone)
    trainer = Trainer(
        model=model,
        train_dataset=train_set,
        eval_dataset=valid_set if valid_select > 0 else None,
        args=TrainingArguments(
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=warmup_steps,
            num_train_epochs=epochs,
            learning_rate=lr,
            logging_steps=logging_steps,
            optim=optim,
            evaluation_strategy="steps" if valid_select > 0 else "no",
            save_strategy="steps",
            eval_steps=eval_steps if valid_select > 0 else None,
            save_steps=save_steps,
            output_dir=output_dir,
            save_total_limit=save_total_limit,
            load_best_model_at_end=True if valid_select > 0 else False,
            group_by_length=False,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    trainer.train()  # run the training loop
    model.save_pretrained(output_dir)  # save the pre-trained model
    tokenizer.save_pretrained(output_dir)  # save the tokenizer


if __name__ == "__main__":
    fire.Fire(main)

File: pre-train.sh

#!/bin/bash

dir_path="../logs/mind/"

if [ ! -d "$dir_path" ]; then
    mkdir -p "$dir_path"
fi

# PYTORCH_ENABLE_MPS_FALLBACK=1 lets ops unsupported by Apple's MPS backend fall back to the CPU
PYTORCH_ENABLE_MPS_FALLBACK=1 torchrun t5_pre-train.py --item_indexing sequential \
--task sequential,straightforward --dataset mind --epochs 1 --batch_size 1024 \
--backbone t5-small --cutoff 1024 --data_path /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/data \
--valid_prompt seen:0 --model_dir /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/models \
--lr 1e-3 --valid_select 1 --warmup_steps 100 --gradient_accumulation_steps 10 --logging_steps 10 --optim 'adamw_torch' \
--eval_steps 200 --save_steps 200 --save_total_limit 3

3.3 Model inference and evaluation

File: t5_evaluate.py

import logging
import os
import sys

import fire
import numpy as np
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
)

sys.path.append('../')
from utils import EvaluationDataset, evaluation_results, get_metrics_results


def main(log_dir: str, checkpoint_path: str, data_path: str, item_indexing: str, task: str,
         dataset: str, cutoff: int, test_prompt: str, eval_batch_size: int, metrics: str):
    # setup
    log_file = os.path.join(log_dir, dataset,
                            checkpoint_path.replace('.', '').replace('/', '_') + '.log')

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=log_file, level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"
    # load test data
    test_data_file = os.path.join(data_path, dataset,
                                  f'{dataset}_{task}_{item_indexing}_test_{test_prompt}.json')
    logging.info("test_data_file=" + test_data_file)
    test_data = load_dataset("json", data_files=test_data_file, field='data')
    model.eval()
    metrics = list(metrics)
    generate_num = max([int(m.split('@')[1]) for m in metrics])
    task_list = np.unique(test_data['train']['task'])
    for t in task_list:
        logging.info(f'testing on {t}')
        subset_data = test_data.filter(lambda example: example['task'] == t)
        dataset = EvaluationDataset(subset_data['train'], tokenizer, cutoff)
        dataloader = DataLoader(dataset, batch_size=eval_batch_size, shuffle=False)
        test_total = 0
        metrics_res = np.array([0.0] * len(metrics))
        for batch in tqdm(dataloader):
            """
            An example batch looks like:
            {'input_ids': tensor([[    3, 21419, 12587,  ...,     0,     0,     0],
                                  ...,
                                  [    3, 21419, 12587,  ...,     0,     0,     0]]),

             'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
                                       ...,
                                       [1, 1, 1,  ..., 0, 0, 0]]),

             'label': tensor([[12587,  2118,   834, 22504,  2577,     1,     0],
                              [12587,  2118,   834, 19993,  4867,     1,     0],
                              ...,
                              [12587,  2118,   834, 19993,  5062,     1,     0]])}
            """

            prediction = model.generate(  # the model's generation function
                input_ids=batch["input_ids"],  # torch.LongTensor of shape (batch_size, sequence_length)
                attention_mask=batch["attention_mask"],  # torch.FloatTensor of shape (batch_size, sequence_length)
                max_length=30,
                num_beams=generate_num,
                num_return_sequences=generate_num,
                output_scores=True,
                return_dict_in_generate=True,
            )
            output_ids = batch['label']
            prediction_ids = prediction["sequences"]  # predictions are token IDs and must be decoded
            prediction_scores = prediction["sequences_scores"]
            gold_sents = tokenizer.batch_decode(  # the user's actual clicks (ground truth)
                output_ids, skip_special_tokens=True
            )
            generated_sents = tokenizer.batch_decode(  # clicks predicted by the model
                prediction_ids, skip_special_tokens=True
            )
            rel_results = evaluation_results(generated_sents, gold_sents, prediction_scores, generate_num)
            test_total += len(rel_results)
            metrics_res += get_metrics_results(rel_results, metrics)

        metrics_res /= test_total
        for i in range(len(metrics)):
            logging.info(f'{metrics[i]}: {metrics_res[i]}')


if __name__ == "__main__":
    fire.Fire(main)

Run t5_evaluate.sh


python t5_evaluate.py --dataset mind --task sequential,straightforward --item_indexing sequential --backbone t5-small \
--checkpoint_path /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/models/mind/sequential/t5-small/ \
--test_prompt seen:0 --log_dir '../logs' \
--data_path /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/data \
--cutoff 1024 --eval_batch_size 32 --metrics hit@5,hit@10,ndcg@5,ndcg@10


python t5_evaluate.py --dataset mind --task sequential,straightforward --item_indexing sequential --backbone t5-small \
--checkpoint_path /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/models/mind/sequential/t5-small/ \
--test_prompt unseen:0 --log_dir '../logs' \
--data_path /Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/train-llm/data \
--cutoff 1024 --eval_batch_size 32 --metrics hit@5,hit@10,ndcg@5,ndcg@10