阅读书籍: 《大模型推荐系统 | 算法原理、代码实战与案例分析》刘强
数据集来源:

注:大型文件包工具git-lfs、强化学习训练工具trl

大模型在电商推荐中的应用

1、冷启动

1.1 数据准备

由于需要为冷启动商品生成模拟的用户行为,因此首先要确定哪些商品属于冷启动商品。

  • 将用户行为数据按照时间排序,前 70%作为训练样本,后 30%作为测试样本。
  • 只在测试数据中出现但不在训练数据中出现的商品就被认为是冷启动商品。

代码cold_start/utils/utils.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import csv
import json

import pandas as pd

# Share of the time-ordered review rows that forms the training split;
# the remaining rows form the test split.
TRAIN_RATIO = 0.7


# Read the raw data
def parse(path):
    """Lazily yield one decoded JSON object per line of a JSON-lines file.

    :param path: path of a text file holding one JSON document per line
    :return: generator of the decoded objects, in file order
    """
    # The context manager closes the handle once the generator finishes or is
    # discarded; previously the file was opened and never closed.
    with open(path, 'r') as handle:
        for row in handle:
            if not row.strip():
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            yield json.loads(row)


# Store the data as a DataFrame so later steps are easier
def get_df(path):
    """Load a JSON-lines file into a DataFrame, one row per record.

    Rows are labelled 0, 1, 2, ... in file order.

    :param path: path of the JSON-lines file, handed to ``parse``
    :return: ``pandas.DataFrame`` built from the parsed records
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')


def get_cold_start_items(path_review: str = "../data/amazon_review/beauty/All_Beauty_5.json") -> set[str]:
    """
    Return the items that occur only in the test part of the review log.

    Reviews are sorted by ``unixReviewTime`` ascending. The first ``TRAIN_RATIO``
    share of rows is the training split and the rest is the test split. An item
    (``asin``) seen in the test split but never in the training split is treated
    as a cold-start item.

    :param path_review: path of the user-review JSON-lines file
    :return: set of cold-start item ids
    """
    df_view = get_df(path_review)

    # Sort by review time, oldest first, and renumber the rows.
    df_view = df_view.sort_values('unixReviewTime', ascending=True).reset_index(drop=True)

    # Use the shared constant so this split always matches get_user_history();
    # the ratio used to be hard-coded as 0.7 here.
    train_num = int(df_view.shape[0] * TRAIN_RATIO)

    train_items = set(df_view.head(train_num)['asin'].unique())
    test_items = set(df_view.iloc[train_num:]['asin'].unique())

    return test_items.difference(train_items)


def get_user_history(path_review: str = "../data/amazon_review/beauty/All_Beauty_5.json",
                     data_type: str = "train") -> dict:
    """
    Build a ``reviewerID -> set of asin`` mapping from one split of the review log.

    Reviews are sorted by ``unixReviewTime`` ascending. The first ``TRAIN_RATIO``
    share of rows is the training split and the rest is the test split.

    :param path_review: path of the user-review JSON-lines file
    :param data_type: ``"train"`` for the first part, ``"test"`` for the rest
    :return: dict mapping each reviewer id to the set of item ids they reviewed
    :raises ValueError: if ``data_type`` is neither ``"train"`` nor ``"test"``
    """
    # Fail early with a clear message; an unknown value used to leave the
    # frame as None and crash later inside groupby.
    if data_type not in ("train", "test"):
        raise ValueError(f"data_type must be 'train' or 'test', got {data_type!r}")

    df_view = get_df(path_review)

    # Sort by review time, oldest first, and renumber the rows.
    df_view = df_view.sort_values('unixReviewTime', ascending=True).reset_index(drop=True)

    train_num = int(df_view.shape[0] * TRAIN_RATIO)
    if data_type == "train":
        df = df_view.head(train_num)
    else:
        df = df_view.iloc[train_num:]

    # Each group holds all review rows of one reviewer (columns such as
    # overall, reviewTime, reviewerID, asin, ...); only the asin column is kept.
    # A reviewer who rated the same item twice contributes it once (set).
    return {
        reviewer_id: set(group['asin'])
        for reviewer_id, group in df.groupby('reviewerID')
    }


def get_metadata_dict(path: str = '../data/amazon_review/beauty/meta_All_Beauty.json') -> dict:
    """
    Read the product metadata and keep only the core fields for later LLM prompts.

    Every non-blank line of the file is one JSON object. The keys read from it
    are ``asin``, ``title``, ``brand``, ``description`` and ``price``.

    :param path: path of the product-metadata JSON-lines file
    :return: dict mapping ``asin`` to
             ``{"title", "brand", "description", "price"}``
    :raises KeyError: if a record lacks one of the keys listed above
    """
    item_dict = {}
    with open(path, 'r') as file:
        # Iterate the lines directly. The file is JSON-lines, not CSV: routing
        # it through csv.reader made long records hit the csv field-size limit
        # and made blank lines fail with IndexError.
        for line in file:
            if not line.strip():
                continue
            j = json.loads(line)
            item_id = j['asin']
            price = j['price']
            # Abnormal data: a long "price" without a dollar sign is not a
            # real price, so blank it out.
            if price != "" and '$' not in price and len(price) > 10:
                price = ""
            item_dict[item_id] = {
                "title": j['title'],
                "brand": j['brand'],
                "description": j['description'],
                "price": price,
            }
    return item_dict

将测试数据生成为微调样本。
代码cold_start/data-process/generate_finetune_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import importlib
import json
import sys

# Put the sibling ``utils`` directory on the import path and import the module
# named ``utils`` from it (presumably cold_start/utils/utils.py — confirm).
sys.path.append('../utils')
utils = importlib.import_module('utils')

# Share of a single user's item history that goes into the prompt; the rest
# becomes the expected output of the sample (see generate_data).
TRAIN_RATIO = 0.7

# Text stored in the "instruction" field of every generated sample.
instruction = ("You are a product expert who predicts which products "
               "users prefer based on your professional knowledge.")


def formatting_input(history, candidate):
    """Fill the purchase history and the candidate list into the prompt text.

    :param history: JSON text describing the items the user already bought
    :param candidate: JSON text describing the candidate items
    :return: the complete prompt string
    """
    return f"""The user purchased the following beauty products(in JSON format):

{history}

Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):

{candidate}

You can choice none, one or more, your output must be JSON format, you just need output item_id, the following is an
output example, A and B is product item_id.

["A", "B"]

Your output must in the candidate list, don't explain.
"""


"""
按照如下格式构建训练数据集:
[
{
"instruction": "You are a product expert who predicts which products users prefer based on your professional knowledge.",
"input": "The user purchased the following beauty products(in JSON format):
[
{
"title": "Fruits & Passion Blue Refreshing Shower Gel - 6.7 fl. oz.",
"brand": "Fruits & Passion",
"price": "",
"item_id": "B000FI4S1E"
},
{
"title": "Yardley By Yardley Of London Unisexs Lay It On Thick Hand & Foot Cream 5.3 Oz",
"brand": "Yardley",
"price": "",
"item_id": "B0009RF9DW"
}
]

Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):

[
{
"title": "Helen of Troy 1579 Tangle Free Hot Air Brush, White, 3/4 Inch Barrel",
"brand": "Helen Of Troy",
"price": "$28.70",
"item_id": "B000WYJTZG"
},
{
"title": "Dolce & Gabbana Compact Parfum, 0.05 Ounce",
"brand": "Dolce & Gabbana",
"price": "",
"item_id": "B019V2KYZS"
},
...
]
"output": '["B0012Y0ZG2","B000URXP6E"]'
},
...
]
"""


def generate_data(output_path: str = '../data/train.json'):
    """
    Build instruction-tuning samples from the training split and save them as JSON.

    The candidate list is every item that occurs in the training split and is
    not a cold-start item. For each user with at least two such items, the
    first ``TRAIN_RATIO`` share of the history goes into the prompt and the
    remaining item ids become the expected output.

    :param output_path: file that receives the JSON array of samples
                        (overwritten if it already exists)
    """
    item_dict = utils.get_metadata_dict()
    train_user_dict = utils.get_user_history(data_type="train")
    cold_start_items = utils.get_cold_start_items()

    def brief(item_id):
        # Work on a copy so the shared metadata dict is not modified. The
        # description is dropped because it is long and costs too many tokens.
        info = {k: v for k, v in item_dict[item_id].items() if k != 'description'}
        info['item_id'] = item_id
        return info

    # All items that occur in the training split and are not cold-start items.
    action_items = set().union(*train_user_dict.values())
    unique_items = action_items.difference(cold_start_items)

    # The sets are unordered; sorting keeps the generated file reproducible.
    candidate = json.dumps([brief(item) for item in sorted(unique_items)],
                           indent=4, ensure_ascii=False)

    data_list = []
    for history in train_user_dict.values():
        # The history is a set, so without sorting the prompt/answer split
        # below would change from run to run.
        history = sorted(item for item in history if item not in cold_start_items)
        if len(history) < 2:  # the user needs at least 2 remaining items
            continue
        train_num = int(len(history) * TRAIN_RATIO)
        train_history = history[:train_num]
        test_history = history[train_num:]
        shown = json.dumps([brief(h) for h in train_history], indent=4, ensure_ascii=False)
        output = json.dumps(test_history, indent=4, ensure_ascii=False)
        data_list.append({
            "instruction": instruction,
            "input": formatting_input(shown, candidate),
            "output": output,
        })

    train_res = json.dumps(data_list, indent=4, ensure_ascii=False)
    # Overwrite rather than append: appending a second JSON array to an
    # existing file produces a document that is no longer valid JSON.
    # UTF-8 is explicit because ensure_ascii=False may emit non-ASCII text.
    with open(output_path, 'w', encoding='utf-8') as file_:
        file_.write(train_res)


# Script entry point: write the fine-tuning samples to the default output path.
if __name__ == "__main__":
    generate_data()

1.2 生成冷启动商品的行为样本

如何为冷启动商品生成模拟的用户行为呢?

  1. 在训练集中随机找20%的用户
  2. 在冷启动商品中随机找两个商品
  3. 让大模型基于用户过往行为,选择一个用户可能喜欢的商品

代码cold_start/data-process/generate_samples.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import importlib
import json
import os
import random
import sys
import time

from openai import OpenAI

# Put the sibling ``utils`` directory on the import path and import the module
# named ``utils`` from it (presumably cold_start/utils/utils.py — confirm).
sys.path.append('../utils')
utils = importlib.import_module('utils')

from dotenv_vault import load_dotenv  # pip install --upgrade python-dotenv-vault

load_dotenv()  # https://vault.dotenv.org/ui/ui1

# API key for the Moonshot endpoint, read from the environment
# (None when the variable is not set).
MOONSHOT_API_KEY = os.getenv("MOONSHOT_API_KEY")

# System message sent with every chat request.
instruction = ("You are a product expert who predicts which of the two products "
               "users prefer based on your professional knowledge.")


def formatting_prompt(History, item_a, item_b):
    """Build the A-or-B question shown to the model.

    :param History: JSON text of the items the user already bought
    :param item_a: JSON text describing product A
    :param item_b: JSON text describing product B
    :return: the complete prompt string
    """
    return f"""The user purchased the following beauty products in JSON format:

{History}

Predict if the user will prefer to purchase product A or B in the next.

A is:

{item_a}

B is:

{item_b}

Your answer must be A or B, don't explain.
"""


def generate_cold_start_samples(store_path: str = '../data/cold_start_action_sample.json'):
    """
    Simulate user behaviour for cold-start items with the Moonshot chat API.

    Roughly 20% of the training users are picked at random. For each of them
    two random cold-start items are shown to the model together with the
    user's history, and the item the model picks ("A" or "B") is recorded as
    ``{"user": ..., "item": ...}``. Answers other than "A"/"B" are not recorded.

    :param store_path: file that receives the JSON array of generated samples
                       (overwritten if it already exists)
    """
    item_dict = utils.get_metadata_dict()
    user_history = utils.get_user_history(data_type="train")
    cold_start_items = list(utils.get_cold_start_items())

    # One client serves the whole run; there is no need to rebuild it per user.
    client = OpenAI(
        api_key=MOONSHOT_API_KEY,
        base_url="https://api.moonshot.cn/v1",
    )

    generated_samples = []
    i = 0
    for user, history in user_history.items():
        if random.random() >= 0.2:  # keep a random ~20% of the users
            continue
        item_a, item_b = random.sample(cold_start_items, 2)
        history_json = json.dumps([item_dict[h] for h in history], indent=4, ensure_ascii=False)
        a_json = json.dumps(item_dict[item_a], indent=4, ensure_ascii=False)
        b_json = json.dumps(item_dict[item_b], indent=4, ensure_ascii=False)
        prom = formatting_prompt(history_json, a_json, b_json)

        llm_response = client.chat.completions.create(
            model="moonshot-v1-32k",  # moonshot-v1-8k / moonshot-v1-32k / moonshot-v1-128k
            messages=[
                {
                    "role": "system",
                    "content": instruction,
                },
                {"role": "user", "content": prom},
            ],
            temperature=0.1,
            stream=False,
        )
        choice = llm_response.choices[0].message.content.strip()

        # Map the answer to the chosen item; anything else yields no sample.
        chosen = {"A": item_a, "B": item_b}.get(choice)
        sample = {} if chosen is None else {"user": user, "item": chosen}

        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(sample, indent=4, ensure_ascii=False))
        # Record each valid answer exactly once. Previously an "A" answer was
        # appended twice and an unrecognised answer appended an empty dict.
        if sample:
            generated_samples.append(sample)
        if i % 7 == 0:
            time.sleep(1)  # keep moonshot from rejecting the calls as too frequent

    res = json.dumps(generated_samples, indent=4, ensure_ascii=False)

    # Overwrite rather than append: appending a second JSON array to an
    # existing file produces a document that is no longer valid JSON.
    with open(store_path, 'w') as file:
        file.write(res)


# Script entry point: generate simulated cold-start samples at the default path.
if __name__ == "__main__":
    generate_cold_start_samples()

一旦为冷启动商品生成了模拟行为,就可以加入真实用户行为数据中。
冷启动商品慢慢变热,就可以采用传统的推荐算法模型了。

1.3 上下文学习能力

冷启动召回:将用户在训练集中的行为作为用户的兴趣历史,然后将所有冷启动商品作为候选集,让大模型从中筛选出用户可能感兴趣的商品(评估时再与用户在测试集中的真实行为对比)。

代码cold_start/item_cold_start_rec.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import json
import os
import time

import torch
from dotenv_vault import load_dotenv # pip install --upgrade python-dotenv-vault
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM

from utils.utils import get_metadata_dict, get_user_history, get_cold_start_items

load_dotenv()  # https://vault.dotenv.org/ui/ui1

# API key for the Moonshot endpoint, read from the environment
# (None when the variable is not set).
MOONSHOT_API_KEY = os.getenv("MOONSHOT_API_KEY")

# Instruction text: the system message for the API model and the
# "### Instruction" section of the local-model prompt.
instruction = ("You are a product expert who predicts which products "
               "users prefer based on your professional knowledge.")


def formatting_prompt(history, candidate):
    """Fill the purchase history and the candidate list into the prompt text.

    :param history: JSON text describing the items the user already bought
    :param candidate: JSON text describing the candidate items
    :return: the complete prompt string
    """
    return f"""The user purchased the following beauty products(in JSON format):

{history}

Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):

{candidate}

You can choice none, one or more, your output must be JSON format, you just need output item_id, the following is an
output example, A and B is product item_id.

["A", "B"]

Your output must in the candidate list, don't explain.
"""


def llm_api_cold_start_rec(store_path: str = 'data/llm_api_rec.json'):
    """
    Ask the Moonshot chat API which cold-start items each user may like.

    Only users present in both the training and the test split are processed.
    The prompt contains the user's training-split history and the full list of
    cold-start items; the raw model answer is stored as
    ``{"user": ..., "rec": ...}``.

    :param store_path: file that receives the JSON array of recommendations
                       (overwritten if it already exists)
    """
    item_dict = get_metadata_dict()
    train_user_dict = get_user_history(data_type="train")
    test_user_dict = get_user_history(data_type="test")
    common_users = set(train_user_dict.keys()).intersection(set(test_user_dict.keys()))
    cold_start_items = get_cold_start_items()

    def without_description(item_id):
        # Copy instead of editing the shared metadata dict. The description
        # is dropped because it is long and costs too many tokens.
        return {k: v for k, v in item_dict[item_id].items() if k != 'description'}

    # The candidate list is identical for every user, so build it once.
    C = []
    for item in cold_start_items:
        info = without_description(item)
        info['item_id'] = item
        C.append(info)
    candidate = json.dumps(C, indent=4, ensure_ascii=False)

    # One client serves the whole run; there is no need to rebuild it per user.
    client = OpenAI(
        api_key=MOONSHOT_API_KEY,
        base_url="https://api.moonshot.cn/v1",
    )

    generated_rec = []
    print("total user number = " + str(len(common_users)))
    i = 0
    for user in common_users:
        H = [without_description(h) for h in train_user_dict[user]]
        history = json.dumps(H, indent=4, ensure_ascii=False)

        prom = formatting_prompt(history, candidate)
        llm_response = client.chat.completions.create(
            model="moonshot-v1-32k",  # moonshot-v1-8k / moonshot-v1-32k / moonshot-v1-128k
            messages=[
                {
                    "role": "system",
                    "content": instruction,
                },
                {"role": "user", "content": prom},
            ],
            temperature=0.1,
            stream=False,
        )
        content = llm_response.choices[0].message.content.strip()
        rec = {
            "user": user,
            "rec": content
        }
        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(rec, indent=4, ensure_ascii=False))
        generated_rec.append(rec)
        if i % 7 == 0:
            time.sleep(1)  # keep moonshot from rejecting the calls as too frequent

    res = json.dumps(generated_rec, indent=4, ensure_ascii=False)

    # Overwrite rather than append: appending a second JSON array to an
    # existing file produces a document that json.loads can no longer read.
    with open(store_path, 'w') as file:
        file.write(res)


def openllm_cold_start_rec(model_path: str = '/Users/liuqiang/Desktop/code/llm/models/Qwen1.5-4B',
                           store_path: str = 'data/openllm_rec.json'):
    """
    Ask a locally loaded causal LM which cold-start items each user may like.

    Only users present in both the training and the test split are processed.
    The prompt contains the instruction, the user's training-split history and
    the full list of cold-start items; the generated text is stored as
    ``{"user": ..., "rec": ...}``.

    :param model_path: directory or name of the model passed to ``from_pretrained``
    :param store_path: file that receives the JSON array of recommendations
                       (overwritten if it already exists)
    """
    item_dict = get_metadata_dict()
    train_user_dict = get_user_history(data_type="train")
    test_user_dict = get_user_history(data_type="test")
    common_users = set(train_user_dict.keys()).intersection(set(test_user_dict.keys()))
    cold_start_items = get_cold_start_items()

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'

    def without_description(item_id):
        # Copy instead of editing the shared metadata dict. The description
        # is dropped because it is long and costs too many tokens.
        return {k: v for k, v in item_dict[item_id].items() if k != 'description'}

    # The candidate list is identical for every user, so build it once.
    C = []
    for item in cold_start_items:
        info = without_description(item)
        info['item_id'] = item
        C.append(info)
    candidate = json.dumps(C, indent=4, ensure_ascii=False)

    generated_rec = []
    print("total user number = " + str(len(common_users)))
    i = 0
    for user in common_users:
        H = [without_description(h) for h in train_user_dict[user]]
        history = json.dumps(H, indent=4, ensure_ascii=False)

        user_input = formatting_prompt(history, candidate)

        prompt = f"""### Instruction:
{instruction}

### Input:
{user_input}

### Response:
"""

        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the inputs to wherever the model was placed instead of the
        # hard-coded 'mps' device, which only exists on Apple-silicon machines.
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=1500, pad_token_id=tokenizer.eos_token_id)
        # Decode only the newly generated tokens. Cutting the decoded text by
        # len(prompt) is wrong whenever decoding does not reproduce the prompt
        # character for character (e.g. after truncation).
        new_tokens = outputs[0][input_ids.shape[1]:]
        predict_output = tokenizer.decode(new_tokens, skip_special_tokens=True)

        rec = {
            "user": user,
            "rec": predict_output
        }
        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(rec, indent=4, ensure_ascii=False))
        generated_rec.append(rec)
        # No rate-limit sleep here: generation is local, no remote API is called.

    res = json.dumps(generated_rec, indent=4, ensure_ascii=False)

    # Overwrite rather than append: appending a second JSON array to an
    # existing file produces a document that json.loads can no longer read.
    with open(store_path, 'w') as file:
        file.write(res)


# Script entry point: produce recommendations with the API model, then with
# the model stored under ./models, then with the default local model.
if __name__ == "__main__":
    llm_api_cold_start_rec()
    openllm_cold_start_rec(model_path='./models', store_path='data/openllm_finetune_rec.json')
    openllm_cold_start_rec()

1.4 模型微调

代码cold_start/model_finetune.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import os
from typing import List

import fire
import torch
from datasets import load_dataset
from peft import (
LoraConfig,
get_peft_model,
prepare_model_for_int8_training,
set_peft_model_state_dict, PeftModel,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, DataCollatorForSeq2Seq, TrainingArguments

from utils.prompter import Prompter


def train(
        # model/data params
        base_model: str = "",  # the only required argument
        data_path: str = "./data/train.json",
        output_dir: str = "./models",
        # training hyperparams
        batch_size: int = 128,
        micro_batch_size: int = 4,
        num_epochs: int = 3,
        learning_rate: float = 3e-4,
        cutoff_len: int = 256,
        val_set_size: int = 2000,
        # lora hyperparams
        lora_r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.05,
        lora_target_modules: List[str] = [
            "q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"
        ],
        # llm hyperparams
        train_on_inputs: bool = True,  # if False, masks out inputs in loss
        add_eos_token: bool = False,
        group_by_length: bool = False,  # faster, but produces an odd training loss curve
        # wandb params
        wandb_project: str = "",
        wandb_run_name: str = "",
        wandb_watch: str = "",  # options: false | gradients | all
        wandb_log_model: str = "",  # options: false | true
        resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
        prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
):
    """
    Fine-tune a causal LM with LoRA on instruction data, then merge and save it.

    Steps: load ``base_model`` in float16, wrap it with a LoRA adapter, tokenize
    the ``instruction``/``input``/``output`` records from ``data_path``, train
    with the Hugging Face ``Trainer``, save adapter and tokenizer to
    ``output_dir``, then reload the base model, merge the adapter into it and
    save the merged model to the same ``output_dir``.

    NOTE(review): prompts are truncated to ``cutoff_len`` tokens (default 256).
    Prompts that embed a long candidate list may lose their answer part —
    confirm the value used for real runs.
    NOTE(review): ``val_set_size`` (default 2000) is passed to
    ``train_test_split`` as ``test_size``; presumably it must be smaller than
    the dataset — confirm for small datasets.

    :param base_model: model name or path for ``from_pretrained`` (required)
    :param data_path: ``.json``/``.jsonl`` file, or a name for ``load_dataset``
    :param output_dir: where checkpoints, adapter, tokenizer and merged model go
    :param batch_size: effective batch size; divided by ``micro_batch_size`` to
                       get the gradient-accumulation steps
    :param micro_batch_size: per-device train batch size
    :param num_epochs: number of training epochs
    :param learning_rate: learning rate handed to ``TrainingArguments``
    :param cutoff_len: maximum number of tokens kept per sample
    :param val_set_size: size of the held-out split; 0 disables evaluation
    :param lora_r: LoRA ``r``
    :param lora_alpha: LoRA ``lora_alpha``
    :param lora_dropout: LoRA dropout
    :param lora_target_modules: module names the adapter is attached to
    :param train_on_inputs: if False, prompt tokens get label -100
    :param add_eos_token: forwarded to ``tokenize`` for the prompt-only
                          tokenization when ``train_on_inputs`` is False
    :param group_by_length: forwarded to ``TrainingArguments``
    :param wandb_project: if non-empty, exported as ``WANDB_PROJECT``
    :param wandb_run_name: run name used when wandb reporting is on
    :param wandb_watch: if non-empty, exported as ``WANDB_WATCH``
    :param wandb_log_model: if non-empty, exported as ``WANDB_LOG_MODEL``
    :param resume_from_checkpoint: directory with a full checkpoint or an adapter
    :param prompt_template_name: template name given to ``Prompter``
    """
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
    gradient_accumulation_steps = batch_size // micro_batch_size

    prompter = Prompter(prompt_template_name)

    device_map = "auto"
    # WORLD_SIZE != 1 is taken to mean a distributed (DDP) launch.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        # Pin the whole model to this process's local device and spread the
        # accumulation steps over the processes.
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size

    # Check if parameter passed or if set within environ
    use_wandb = len(wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    # Only overwrite environ if wandb param passed
    if len(wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = wandb_project
    if len(wandb_watch) > 0:
        os.environ["WANDB_WATCH"] = wandb_watch
    if len(wandb_log_model) > 0:
        os.environ["WANDB_LOG_MODEL"] = wandb_log_model

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # NOTE(review): id 0 is described as "unk" below; whether that holds
    # depends on the tokenizer of base_model — confirm.
    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        # Tokenize one prompt, truncate it to cutoff_len, append EOS when it
        # is missing and there is room, and copy input_ids into labels.
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
                result["input_ids"][-1] != tokenizer.eos_token_id
                and len(result["input_ids"]) < cutoff_len
                and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        # Render one record through the prompt template and tokenize it.
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            # Tokenize the prompt without the answer to learn its length, then
            # set the labels of that prefix to -100.
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point["input"]
            )
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=add_eos_token
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # could be sped up, probably
        return tokenized_full_prompt

    # peft helper applied before the adapter is attached; see the peft
    # documentation for what it changes on the model.
    model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(model, config)

    if data_path.endswith(".json") or data_path.endswith(".jsonl"):  # the data is in JSON or JSONL format
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    if resume_from_checkpoint:
        # Check the available weights and load them
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )  # Full checkpoint
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(peft_model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    peft_model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

    if val_set_size > 0:
        # Hold out val_set_size samples for evaluation (fixed seed for the split).
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = (
            train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        )
        val_data = (
            train_val["test"].shuffle().map(generate_and_tokenize_prompt)
        )
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        peft_model.is_parallelizable = True
        peft_model.model_parallel = True

    trainer = Trainer(
        model=peft_model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=200 if val_set_size > 0 else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True if val_set_size > 0 else False,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            report_to="wandb" if use_wandb else None,
            run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator=DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    peft_model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Save the LoRA weights
    trainer.model.save_pretrained(output_dir)  # save the trained (adapter) model
    tokenizer.save_pretrained(output_dir)  # save the tokenizer

    # Merge the LoRA weights into the original model and save the result
    # (written to the same output_dir as the adapter files above).
    model_to_merge = PeftModel.from_pretrained(
        AutoModelForCausalLM.from_pretrained(base_model), output_dir)

    merged_model = model_to_merge.merge_and_unload()
    merged_model.save_pretrained(output_dir)


# Script entry point: expose train() as a command-line interface via fire.
if __name__ == "__main__":
    fire.Fire(train)

# The fine-tuning code here is exactly the same as the one in personalized_rec

1.5 模型效果评估

将平均精准率、平均召回率作为比较指标,下面的代码用于比较 1.3 中各模型(API 大模型、开源模型及其微调版本)基于上下文学习的冷启动推荐效果。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import importlib
import json
import sys

# Put the ``utils`` directory on the import path and import the module named
# ``utils`` from it (presumably cold_start/utils/utils.py — confirm).
sys.path.append('utils')
utils = importlib.import_module('utils')

# Maximum number of recommendations per user that evaluate() scores.
REC_MUM = 8


def precision(rec_list: list, action_list: list) -> float:
    """
    Precision of the recommendations for a single user.

    Duplicates are ignored on both sides.

    :param rec_list: items recommended by the algorithm
    :param action_list: items the user actually bought
    :return: share of distinct recommended items that were bought;
             0.0 when nothing was recommended
    """
    recommended = set(rec_list)
    if not recommended:
        return 0.0
    hits = recommended & set(action_list)
    return len(hits) / len(recommended)


def recall(rec_list: list, action_list: list) -> float:
    """
    Recall of the recommendations for a single user.

    Duplicates are ignored on both sides.

    :param rec_list: items recommended by the algorithm
    :param action_list: items the user actually bought
    :return: share of distinct bought items that were recommended;
             0.0 when the user bought nothing
    """
    bought = set(action_list)
    if not bought:
        return 0.0
    hits = bought & set(rec_list)
    return len(hits) / len(bought)


def find_all_occurrences(s, char):
    """Return the start index of every occurrence of ``char`` in ``s``.

    Overlapping occurrences are included, because each search resumes one
    position after the previous hit.

    :param s: string to search in
    :param char: substring to look for
    :return: list of start indices in ascending order
    """
    indices = []
    cursor = 0
    while True:
        hit = s.find(char, cursor)
        if hit == -1:
            return indices
        indices.append(hit)
        cursor = hit + 1


def evaluate(data_path: str, model_type: str) -> tuple[float, float]:
    """
    Compute average precision and recall of one recommendation file.

    The file holds a JSON array of ``{"user": ..., "rec": ...}`` records.
    Only users that also occur in the test split are scored, against the items
    they interacted with there. At most ``REC_MUM`` recommendations per user
    are considered.

    :param data_path: path of the recommendation JSON file
    :param model_type: ``"llm_api"`` (``rec`` is a list literal of item ids) or
                       ``"openllm"`` / ``"openllm_finetune"`` (item ids are
                       extracted from ``"item_id": "..."`` fragments in the text)
    :return: ``(average precision, average recall)``; ``(0.0, 0.0)`` when no
             recommended user occurs in the test split
    :raises ValueError: if ``model_type`` is not one of the supported values
    """
    import ast  # local import: only needed for the safe literal parsing below

    if model_type not in ('llm_api', 'openllm', 'openllm_finetune'):
        # An unknown type used to surface later as a TypeError on None.
        raise ValueError(f"unsupported model_type: {model_type!r}")

    test_user_dict = utils.get_user_history(data_type="test")
    with open(data_path, 'r') as file:
        rec = json.load(file)

    marker = '"item_id": "'
    common_num = 0  # number of recommended users that also occur in the test split
    acc_p = 0.0  # accumulated precision
    acc_r = 0.0  # accumulated recall
    for x in rec:
        user = x['user']
        if user not in test_user_dict:
            continue
        common_num += 1
        action_list = test_user_dict[user]
        temp = x['rec']
        if model_type == 'llm_api':
            # SECURITY: the text comes from an LLM, so it must not be passed to
            # eval(). literal_eval accepts the same list literal without
            # executing code. An unparsable answer counts as "no recommendation".
            try:
                parsed = ast.literal_eval(temp)
            except (ValueError, SyntaxError):
                parsed = []
            rec_list = list(parsed) if isinstance(parsed, (list, tuple)) else []
        else:
            # Qwen and the fine-tuned model produce free-form text; pick the
            # 10 characters after each marker (item ids are 10 characters long).
            rec_list = [
                temp[loc + len(marker): loc + len(marker) + 10]
                for loc in find_all_occurrences(temp, marker)
            ]
        # Cap the list so models that return more items are compared fairly.
        rec_list = rec_list[:REC_MUM]
        acc_p += precision(rec_list, action_list)
        acc_r += recall(rec_list, action_list)

    if common_num == 0:
        # No overlapping user: avoid dividing by zero.
        return 0.0, 0.0

    return acc_p / common_num, acc_r / common_num


# Script entry point: score the three recommendation files and print the
# average precision / recall of each as JSON.
if __name__ == "__main__":
    runs = [
        ('llm_api', 'data/llm_api_rec.json'),
        ('openllm', 'data/openllm_rec.json'),
        ('openllm_finetune', 'data/openllm_finetune_rec.json'),
    ]

    res = []
    for kind, rec_file in runs:
        avg_p, avg_r = evaluate(rec_file, model_type=kind)
        res.append({
            kind + "_avg_p": avg_p,
            kind + "_avg_r": avg_r
        })

    print(json.dumps(res, indent=4, ensure_ascii=False))

最终效果发现:Qwen-4B微调模型的效果反而不如没有微调的。
可能原因:

  1. 微调样本数量太少,没有学习到重要的规律
  2. 提供的数据不够好,缺失某种特征,引入了噪声
  3. 模型参数比较少,根本难以解决这个问题

2、生成用户兴趣画像

3、生成个性化商品描述

4、猜你喜欢推荐

5、电商关联推荐

  • also_buy买了该商品的用户还买了
  • also_view浏览了该商品的用户还浏览了
  • 前者更强烈,相似度设置为2,后者设置为 1
  • 负样本的相似度设置为 0
  • 正负样本数量应该差不多

5.1 数据准备

构建训练、测试样本similar_rec/data-process/generate_funetune_data.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import json
import random

"""
按照如下格式构建训练、测试数据集:

{

"instruction": "You are a product expert who judges whether
two products are similar based on your professional knowledge.",

"input": "I will provide you with two product related introduction information, as follows(in JSON format):

[
{
"title": "SF221-Shaving Factory Straight Razor (Black), Shaving Factory Hand Made Shaving Brush, 100...",
"brand": "Shaving Factory",
"price": "$21.95",
"description": ["Start Up combines citrus essential oils with gentle Alpha Hydroxy Acids to cleanse and refresh
your face. The 5% AHA level is gentle enough for all skin types.", "", ""],
},
{
"title": "Loud 'N Clear&trade; Personal Sound Amplifier",
"brand": "idea village",
"price": "",
"description": ["Loud 'N Clear Personal Sound Amplifier allows you to turn up the volume on what people around
you are saying, listen at the level you want without disturbing others, hear a pin drop from across the room."],
}
]

Based on above information, please predict if these two products are similar. The similarity is between 0 and 2,
0 being lowest and 2 being highest. You just need to ranking the above product, do not explain the reason.

"output": "0"

}

"""

"""
构建训练集、测试集的思路:
基于metadata数据集中also_buy、also_view 字段,某个商品与also_buy、also_view中的商品认为是相似的,这些可以做为正样本。
但他们的相似度应该不一样,also_buy是更强烈的偏好,我们设置相似度为2,also_view设置为1
为了让训练样本更加平衡,可以随机选择两个商品对做为负样本,选择负样本的数量跟正样本差不多。负样本设置相似度为0。
下面就基于这个思路来进行处理。

"""

# Text stored in the "instruction" field of every generated sample.
instruction = ("You are a product expert who judges whether "
               "two products are similar based on your professional knowledge.")


def _item_summary(info: dict) -> dict:
    """Keep only the fields of one product that are shown to the model."""
    return {
        "title": info['title'],
        "brand": info['brand'],
        "price": info['price'],
        "description": info['description']
    }


def _build_sample(first: dict, second: dict, label: str) -> dict:
    """Build one instruction-tuning record for a product pair and its label."""
    formatted_input = json.dumps([_item_summary(first), _item_summary(second)],
                                 indent=4, ensure_ascii=False)
    prompt = ("I will provide you with two product related introduction information, as follows(in JSON " +
              "format):\n\n" +
              formatted_input + "\n\n" +
              "Based on above information, please predict if these two products are similar. The similarity " +
              "is between 0 and 2, 0 being lowest and 2 being highest. You just need to ranking the above " +
              "product, do not explain the reason.")
    return {
        "instruction": instruction,
        "input": prompt,
        "output": label
    }


def generate_data(out_path: str, item_dict: dict, test_ratio: float = 0.3):
    """
    Build similarity samples from product metadata and write train/test files.

    Positive pairs come from each product's ``also_buy`` (label "2") and
    ``also_view`` (label "1") lists, restricted to products present in
    ``item_dict``. About as many random pairs are added as negatives
    (label "0"). The shuffled samples are split and written to
    ``<out_path>/test.json`` and ``<out_path>/train.json``; existing files
    are overwritten.

    :param out_path: directory that receives ``test.json`` and ``train.json``
    :param item_dict: item id -> metadata dict with the keys ``title``,
                      ``brand``, ``price``, ``description``, ``also_view``
                      and ``also_buy``
    :param test_ratio: share of the samples written to ``test.json``
    """
    data_list = []
    positive_pairs = set()

    # Positive samples
    for item, info in item_dict.items():
        also_buy = set(info['also_buy'])
        for other in also_buy.union(info['also_view']):
            if other not in item_dict:
                continue
            # also_buy is the stronger signal, so it wins when both lists name the item.
            label = "2" if other in also_buy else "1"
            data_list.append(_build_sample(info, item_dict[other], label))
            positive_pairs.add(frozenset((item, other)))

    # Negative samples: about as many as positives, to keep the classes balanced.
    positive_sample_num = len(data_list)
    # random.sample needs a sequence; passing dict keys directly raises
    # TypeError on Python 3.11+.
    item_ids = list(item_dict)
    if len(item_ids) >= 2:
        negatives = 0
        attempts = 0
        # Bound the retries so a tiny or densely linked catalogue cannot loop forever.
        max_attempts = positive_sample_num * 10
        while negatives < positive_sample_num and attempts < max_attempts:
            attempts += 1
            first_id, second_id = random.sample(item_ids, 2)
            if frozenset((first_id, second_id)) in positive_pairs:
                # A known related pair must not be labelled "not similar".
                continue
            data_list.append(_build_sample(item_dict[first_id], item_dict[second_id], "0"))
            negatives += 1

    # Split into test and train sets
    random.shuffle(data_list)
    split_loc = int(len(data_list) * test_ratio)
    test_data_list = data_list[0: split_loc]
    train_data_list = data_list[split_loc:]
    test_res = json.dumps(test_data_list, indent=4, ensure_ascii=False)
    train_res = json.dumps(train_data_list, indent=4, ensure_ascii=False)
    # Overwrite rather than append: appending a second JSON array to an
    # existing file produces a document that is no longer valid JSON.
    # UTF-8 is explicit because ensure_ascii=False may emit non-ASCII text.
    with open(out_path + "/test.json", 'w', encoding='utf-8') as file_:
        file_.write(test_res)
    with open(out_path + "/train.json", 'w', encoding='utf-8') as file_:
        file_.write(train_res)


# Script driver: load the item metadata dictionary and hand it to generate_data.
from generate_item_dict import get_metadata_dict

item_dict = get_metadata_dict()  # called without arguments, so the function's default path is used
# NOTE(review): the arguments are presumably (out_path, item_dict, test_ratio) — confirm
# against generate_data's signature, which is defined earlier in this file.
generate_data("../data", item_dict, 0.3)

"""
目前train.json 4616个样本。
目前test.json 3706个样本。
"""

下面为每个商品生成相关的信息字典。
代码similar_rec/data-process/generate_item_dict.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import csv
import json


def get_metadata_dict(path: str = '../../data/amazon_review/beauty/meta_All_Beauty.json') -> dict:
    """
    Load the product metadata file and index the fields used downstream by item id.

    The file is read as JSON Lines: one JSON object per line, for example
    {"asin": "B00006IGL2", "title": "Kiss My Face Exfoliating Face Wash Start Up, 4 Fluid Ounce",
     "brand": "Kiss My Face", "description": ["Start Up combines citrus essential oils ...", "", ""],
     "price": "", "also_buy": ["B000Z96JDI", "B00006IGL8"], "also_view": ["B000Z96JDI"], ...}

    :param path: path of the metadata file
    :return: dict mapping asin -> {"title", "brand", "description", "also_buy", "also_view", "price"}
    :raises KeyError: if a record lacks one of the fields listed above
    """
    item_dict = {}
    # Read the lines directly instead of going through csv.reader: the csv parser
    # yields an empty row for blank lines (IndexError on row[0]) and enforces a
    # field-size limit that a long JSON record can exceed.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            price = record['price']
            # Some records carry long non-price text in this field; blank those out.
            if price != "" and '$' not in price and len(price) > 10:
                price = ""
            item_dict[record['asin']] = {
                "title": record['title'],
                "brand": record['brand'],
                "description": record['description'],
                "also_buy": record['also_buy'],
                "also_view": record['also_view'],
                "price": price,
            }
    return item_dict

5.2 多路召回

下面实现了四种召回算法,分别基于:

  1. 标签 brand
  2. 同时购买 also_buy
  3. 同时浏览 also_view
  4. 嵌入相似

代码similar_rec/recall_items.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import importlib
import operator
import sys

from sentence_transformers import SentenceTransformer

sys.path.append('./data-process')
generate_item_dict = importlib.import_module('generate_item_dict')


def tags_recall(item_id: str, item_dict: dict) -> list[str]:
    """
    Tag-based recall that uses the brand as the tag.

    :param item_id: id of the item to find candidates for
    :param item_dict: item metadata dictionary, keyed by item id
    :return: ids of all other items whose brand equals the brand of item_id;
             an empty list when item_id has no brand
    """
    brand = item_dict[item_id]['brand']
    # An empty brand means "unknown"; matching on it would recall every
    # unbranded item as if they all shared one brand.
    if not brand:
        return []
    return [key for key, value in item_dict.items()
            if key != item_id and value['brand'] == brand]


def embedding_recall(item_id: str,
                     item_dict: dict,
                     recall_num: int = 20,
                     min_similar_score: float = 0.8,
                     model_path: str = '/Users/liuqiang/Desktop/code/llm/models/bge-large-en-v1.5',
                     model=None) -> list[str]:
    """
    Recall items whose text embedding is close to the embedding of item_id.

    Each item is represented by its title plus the first entry of its description.
    Candidates are scanned in item_dict order and the scan stops as soon as
    recall_num candidates have exceeded min_similar_score, so the result is the
    first recall_num hits (sorted by score), not the global top recall_num.

    This is a demonstration rather than an efficient implementation: every
    candidate is encoded on the fly. Precomputing all item embeddings and storing
    them in a vector index (faiss or similar) would be the faster approach.

    :param item_id: id of the item to find candidates for
    :param item_dict: item metadata dictionary, keyed by item id
    :param recall_num: maximum number of items to return, 20 by default
    :param min_similar_score: a candidate is kept only when its score is above this value
    :param model_path: location of the sentence-embedding model, used when model is None
    :param model: optional already-loaded encoder exposing
                  encode(sentences, normalize_embeddings=...); pass it to avoid
                  reloading the model on every call
    :return: recalled item ids, highest score first
    """
    if model is None:
        model = SentenceTransformer(model_path)

    def _describe(info: dict) -> str:
        # Fall back to the title alone when the description list is empty.
        descriptions = info['description']
        desc = descriptions[0] if descriptions else ""
        return "title: " + info['title'] + "\n" + "description: " + desc

    query_embedding = model.encode([_describe(item_dict[item_id])], normalize_embeddings=True)
    similar_list = []
    for key, value in item_dict.items():
        if len(similar_list) >= recall_num:
            break  # enough hits; nothing later in the dict can be added
        if key == item_id or not value['description']:
            continue
        candidate_embedding = model.encode([_describe(value)], normalize_embeddings=True)
        score = (query_embedding @ candidate_embedding.T)[0][0]
        if score > min_similar_score:
            similar_list.append((key, score))
    similar_list.sort(key=operator.itemgetter(1), reverse=True)
    return [key for key, _ in similar_list[:recall_num]]


def also_buy_recall(item_id: str, item_dict: dict) -> [str]:
    """
    Recall via the also_buy field of the item metadata.

    :param item_id: id of the item to find candidates for
    :param item_dict: item metadata dictionary, keyed by item id
    :return: the also_buy list stored for item_id (the stored list itself, not a copy)
    """
    item_info = item_dict[item_id]
    return item_info['also_buy']


def also_view_recall(item_id: str, item_dict: dict) -> [str]:
    """
    Recall via the also_view field of the item metadata.

    :param item_id: id of the item to find candidates for
    :param item_dict: item metadata dictionary, keyed by item id
    :return: the also_view list stored for item_id (the stored list itself, not a copy)
    """
    item_info = item_dict[item_id]
    return item_info['also_view']


if __name__ == "__main__":
    # Demo: embedding recall for one item, asking for 10 results with a 0.75 threshold.
    # NOTE(review): get_metadata_dict() is called with its default path here, while the
    # ranking script passes an explicit path — confirm the default resolves from this
    # script's working directory.
    separator = "----------"
    metadata = generate_item_dict.get_metadata_dict()
    print(separator)
    recalled_items = embedding_recall("B00006IGL2", metadata, 10, 0.75)
    print(recalled_items)
    print(separator)

5.3 相似度排序

两种方法:

  1. 基于 rerank 模型的排序
  2. 基于微调后的大模型的排序

代码similar_rec/similar_ranking.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import importlib
import json
import operator
import sys

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM

sys.path.append('./data-process')
generate_item_dict = importlib.import_module('generate_item_dict')


def cross_encoder_rerank(item_id: str,
                         recall_list: list[list[str]],
                         item_dict: dict,
                         top_n: int = 10,
                         model_path: str = '/Users/liuqiang/Desktop/code/llm/models/bge-reranker-large',
                         device: str = "mps") -> list[dict]:
    """
    Rank the recalled items against item_id with a cross-encoder reranker.

    :param item_id: id of the item that similar items are being recommended for
    :param recall_list: one list of recalled item ids per recall strategy
    :param item_dict: item metadata dictionary, keyed by item id
    :param top_n: maximum number of results, 10 by default
    :param model_path: location of the reranker model
    :param device: device the reranker runs on ("mps" by default, as before)
    :return: up to top_n dicts {"item": id, "score": float}, highest score first;
             an empty list when no recalled item is present in item_dict
    """

    def _describe(info: dict) -> str:
        # Fall back to the title alone when the description list is empty.
        descriptions = info['description']
        desc = descriptions[0] if descriptions else ""
        return "title: " + info['title'] + "\n" + "description: " + desc

    # Merge the recall lists, dropping duplicates but keeping first-seen order so
    # that ties are resolved the same way on every run.
    merged = dict.fromkeys(item for lst in recall_list for item in lst)
    candidates = [item for item in merged if item in item_dict]
    if not candidates or top_n <= 0:
        return []  # nothing to rank; skip loading the model

    model = CrossEncoder(model_name=model_path, max_length=512, device=device)

    query_info = _describe(item_dict[item_id])
    sentence_pairs = [[query_info, _describe(item_dict[item])] for item in candidates]
    results = model.predict(sentences=sentence_pairs,
                            batch_size=32,
                            num_workers=0,
                            convert_to_tensor=True)
    top_k = min(top_n, len(results))
    values, indices = results.topk(top_k)
    return [{"item": candidates[index], "score": score}
            for score, index in zip(values.tolist(), indices.tolist())]


def _parse_similarity_score(text: str):
    """Return the number that starts *text* as a float, or None when there is none."""
    tokens = text.split()
    if not tokens:
        return None
    try:
        return float(tokens[0].rstrip('.,;:'))
    except ValueError:
        return None


def llm_rerank(item_id: str,
               recall_list: list[list[str]],
               item_dict: dict,
               top_n: int = 10,
               model_path: str = './models') -> list[dict]:
    """
    Rank the recalled items against item_id with the fine-tuned causal language model.

    For every candidate, the model is prompted with the two product records and
    its generated text is parsed as a similarity score. Candidates whose
    generated text does not start with a number are left out of the result.

    :param item_id: id of the item that similar items are being recommended for
    :param recall_list: one list of recalled item ids per recall strategy
    :param item_dict: item metadata dictionary, keyed by item id
    :param top_n: maximum number of results, 10 by default
    :param model_path: directory of the fine-tuned model
    :return: up to top_n dicts {"item": id, "score": float}, highest score first;
             an empty list when no recalled item is present in item_dict
    """
    print(model_path)

    # Merge the recall lists, dropping duplicates but keeping first-seen order.
    merged = dict.fromkeys(item for lst in recall_list for item in lst)
    candidates = [item for item in merged if item in item_dict]
    if not candidates:
        return []  # nothing to rank; skip loading the model

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'

    instruction = ("You are a product expert who judges whether "
                   "two products are similar based on your professional knowledge.")

    def _product_view(info: dict) -> dict:
        return {
            "title": info['title'],
            "brand": info['brand'],
            "price": info['price'],
            "description": info['description'],
        }

    a_dict = _product_view(item_dict[item_id])

    results = []
    for item in candidates:
        sample_pair = [a_dict, _product_view(item_dict[item])]
        formatted_input = json.dumps(sample_pair, indent=4, ensure_ascii=False)
        # The wording, including the 0-2 range, has to match the prompts the
        # fine-tuning samples were generated with.
        input_text = ("I will provide you with two product related introduction information, as follows(in JSON " +
                      "format):\n\n" +
                      formatted_input + "\n\n" +
                      "Based on above information, please predict if these two products are similar. The similarity " +
                      "is between 0 and 2, 0 being lowest and 2 being highest. You just need to ranking the above " +
                      "product, do not explain the reason.")
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the input to wherever device_map="auto" placed the model.
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)
        # Decode only the newly generated tokens; slicing the decoded text by
        # len(prompt) breaks when the prompt was truncated or does not round-trip.
        predict_output = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
        score = _parse_similarity_score(predict_output)
        if score is None:
            continue
        results.append({"item": item, "score": score})

    sorted_list = sorted(results, key=operator.itemgetter('score'), reverse=True)
    return sorted_list[:top_n]


if __name__ == "__main__":
    # Demo: rank hand-written recall results for one item with the fine-tuned LLM.
    # cross_encoder_rerank accepts the same first four arguments and can be tried instead.
    separator = "----------"
    query_item = "B00006IGL2"

    embedding_hits = ['B000052YPP', 'B00005308B', 'B0000530HZ', 'B000052YD8', 'B00005308M', 'B000052YMO',
                      '9790787006', '6546546450', '9744914572', '7414204790']
    also_view_hits = ['B000Z96JDI', 'B001FB5HZG', 'B00213WCNC', 'B00BBFOVO4', 'B0085EVLRO']
    also_buy_hits = ['B000Z96JDI', 'B00006IGL8', 'B007C5X34G', 'B00006IGLF', 'B00213WCNC', 'B00D1W1QXE',
                     'B001FB5HZG', 'B000FQ86RI', 'B0012BSKBM', 'B0085EVLRO', 'B00A2EXVQE']
    brand_hits = ['B00028EYZW', 'B001E0T0HE', 'B00FTBJ6HI', 'B00KLDU08S', 'B00OQQWU4I']
    recall_sources = [embedding_hits, also_buy_hits, also_view_hits, brand_hits]

    metadata = generate_item_dict.get_metadata_dict('../data/amazon_review/beauty/meta_All_Beauty.json')

    print(separator)
    ranked = llm_rerank(query_item, recall_sources, metadata, 10, './models')
    print(json.dumps(ranked, indent=4, ensure_ascii=False))
    print(separator)

5.4 效果评估

预测相似度(0-2),利用 RMSE 指标评估
代码similar_rec/evaluate.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import json
import math

import fire
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


def is_valid_json(text):
    """Report whether *text* can be parsed by json.loads."""
    try:
        json.loads(text)
    except json.JSONDecodeError:
        return False
    else:
        return True


def is_float(s):
    """Report whether float(s) succeeds without a ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


def rmse(_true, _predict):
    """
    Root of the squared error for a single (gold, predicted) pair.

    For one sample sqrt((t - p) ** 2) is the absolute error. The previous
    sqrt(|t - p|) shrank errors above 1 and inflated errors below 1.

    :param _true: gold value
    :param _predict: predicted value
    :return: the error as a non-negative float
    """
    return math.fabs(_true - _predict)


def output_format(output) -> float:
    """
    Extract the predicted similarity score from the text generated by the model.

    A model that has not been fine-tuned does not always answer with a bare
    number, so several output shapes are recognised:
    1. a number at the very start of the output, e.g. "0.5" or "2";
    2. "The similarity between the two products is 0.5. ...";
    3. "The similarity between these two products is 0.5.";
    4. a JSON-like answer containing '"similarity": 0'.

    Only the first 1000 characters are inspected. The phrases are tried in the
    order above; the first phrase that occurs decides the result, even when no
    number follows it.

    :param output: text generated by the model
    :return: the score as a float, or -1 when no score could be extracted
    """
    import re  # local import: this module does not import re at file level

    # Matches the whole number. The earlier two-character slice read "0.5" as "0.".
    number = re.compile(r"\s*(\d+(?:\.\d+)?|\.\d+)")

    head = output[:1000]  # anything generated past this point is ignored
    leading = number.match(head)
    if leading:
        return float(leading.group(1))

    markers = (
        'The similarity between the two products is ',
        'The similarity between these two products is ',
        '"similarity": ',
    )
    for marker in markers:
        position = head.find(marker)
        if position > -1:
            found = number.match(head, position + len(marker))
            return float(found.group(1)) if found else -1
    return -1


def load_model_token(model_path: str) -> (AutoModelForCausalLM, AutoTokenizer):
    """
    Load the causal language model and its tokenizer from model_path.

    :param model_path: directory (or name) of the model to load
    :return: (model, tokenizer); the tokenizer's padding side is set to 'right'
    """
    model_kwargs = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
    }
    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'  # to prevent warnings
    return model, tokenizer


def evaluate(model_path: str,
             test_data_path: str = './data',
             keep_sample_num: int = 10) -> float:
    """
    Compute the RMSE of a model's predicted similarity scores on the test split.

    The first keep_sample_num test samples are turned into prompts, the model
    generates an answer for each, and the score parsed from the answer is
    compared with the gold score. Samples whose answer cannot be parsed are
    skipped.

    :param model_path: directory (or name) of the model to evaluate
    :param test_data_path: dataset location passed to load_dataset
    :param keep_sample_num: maximum number of test samples to evaluate
    :return: sqrt(mean squared error) over the parsed samples, or NaN when no
             sample could be parsed
    """
    model, tokenizer = load_model_token(model_path)

    dataset_dict = load_dataset(test_data_path)
    test_dataset = dataset_dict['test'][0:keep_sample_num]
    # The split may hold fewer rows than requested.
    sample_num = len(test_dataset['output'])

    squared_error_sum = 0.0  # accumulated squared error
    acc_num = 0  # number of samples that contributed to the statistic

    for i in range(sample_num):
        prompt = (f"### Instruction:\n{test_dataset['instruction'][i]}\n\n"
                  f"### Input:\n{test_dataset['input'][i]}\n\n"
                  f"### Response:\n")
        gold_output = float(test_dataset['output'][i])
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the input to wherever device_map="auto" placed the model.
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)
        # Decode only the newly generated tokens; slicing the decoded text by
        # len(prompt) breaks when the prompt was truncated or does not round-trip.
        predict_output = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
        print("--------sample: " + str(i) + "--------")
        print("预测的输出: " + predict_output + "\n")
        score = output_format(predict_output)
        print("预测的score: " + str(score) + "\n")
        if score == -1:  # the answer was too free-form to extract a score from
            continue
        acc_num += 1
        sample_error = abs(gold_output - score)  # root of the squared error of one sample
        squared_error_sum += sample_error ** 2
        dic = {  # per-sample report
            "sample": i,
            "input": test_dataset['input'][i],
            "gold_output": gold_output,
            "predict_output": score,
            "rmse": sample_error
        }
        print(json.dumps(dic, indent=4, ensure_ascii=False))
        print("----------------")

    if acc_num == 0:
        # No answer could be parsed; report "undefined" instead of dividing by zero.
        return float('nan')
    return math.sqrt(squared_error_sum / acc_num)


def effect_comparison(base_model_path: str = '/Users/liuqiang/Desktop/code/llm/models/Qwen1.5-4B',
                      finetune_model_path: str = './models',
                      test_data_path: str = './data',
                      keep_sample_num: int = 10):
    """
    Evaluate the base model and the fine-tuned model on the same test data and print both results.

    :param base_model_path: location of the base model
    :param finetune_model_path: location of the fine-tuned model
    :param test_data_path: dataset location, forwarded to evaluate
    :param keep_sample_num: number of test samples, forwarded to evaluate
    """
    shared_args = (test_data_path, keep_sample_num)
    avg_base_rmse = evaluate(base_model_path, *shared_args)
    avg_finetune_rmse = evaluate(finetune_model_path, *shared_args)

    print(f"基底模型的平均rmse:{avg_base_rmse}")
    print(f"微调模型的平均rmse:{avg_finetune_rmse}")


if __name__ == "__main__":
    # Hand effect_comparison to python-fire, which builds the command-line interface from it.
    fire.Fire(effect_comparison)

6、推荐解释

7、对话式推荐