Book: 《大模型推荐系统 | 算法原理、代码实战与案例分析》 by 刘强
Dataset source: MIND (Microsoft News Dataset)

Note: tools used include git-lfs (large-file storage) and trl (reinforcement-learning training library).

The Fine-Tuning Paradigm

Fine-tuning a large model for personalized recommendation


1. Fine-Tuning Methods

  • Instruction tuning
  • RLHF (the two methods consume differently shaped data; see the sketch after this list)
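
The sketch below is only an illustration of the two data shapes, not code from the book; the field names follow the common Alpaca-style and preference-pair conventions rather than any specific library.

# Instruction tuning: one supervised (instruction, input, output) triple per sample.
instruction_sample = {
    "instruction": "Identify whether the user will like the target news.",
    "input": "User Preference: ...\nUser Unpreference: ...",
    "output": "Yes.",
}

# RLHF: a reward model is first trained on preference pairs like this one;
# PPO then optimizes the generator against that reward model.
preference_pair = {
    "prompt": "Recommend a news article for this user: ...",
    "chosen": "A response the annotator preferred.",
    "rejected": "A response the annotator rejected.",
}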

2. Case Studies

2.1 The TALLRec Fine-Tuning Framework

  1. Construction of the instruction-data template
  2. The two-stage fine-tuning principle (instruction tuning first, then recommendation tuning; see the sketch after this list)
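
A minimal sketch of the two-stage schedule, expressed with the train() entry point from finetune_tallrec.py in section 3.2; the Alpaca data path is a hypothetical placeholder, not from the book.

from finetune_tallrec import train

# Stage 1 (instruction tuning): general Alpaca-style instruction data.
train(
    base_model="huggyllama/llama-7b",
    data_path="./data/alpaca_data.json",  # hypothetical path to Alpaca-style data
    output_dir="./lora-instruction",
)

# Stage 2 (recommendation tuning): the MIND-derived samples built in section 3.1,
# warm-started from the stage-1 LoRA adapter.
train(
    base_model="huggyllama/llama-7b",
    data_path="./data/mind/train.json",
    output_dir="./lora-rec",
    resume_from_checkpoint="./lora-instruction",  # loads adapter_model.bin saved by stage 1
)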

2.2 GIRL: A Fine-Tuning Framework Based on Human Feedback

  • Three JD (job description) recommendation paradigms
    1. Conventional job recommendation
    2. Generative job recommendation
    3. Generation-enhanced job recommendation
  • The RLHF-based JD generation framework (a trl-based sketch follows this list)
    1. Supervised fine-tuning
    2. Reward-model training
    3. PPO parameter updates
  • The generation-enhanced JD recommendation scheme
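
The three RLHF steps map naturally onto the trl library mentioned in the note at the top. Below is a minimal sketch of step 3 (PPO updates) only, and it is not the GIRL authors' code: sft_model_path and the constant reward are placeholders (the reward would come from the step-2 reward model), and the calls follow trl's classic PPOTrainer quickstart, whose interface differs in newer trl releases.

import torch
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

sft_model_path = "path/to/sft-jd-generator"  # hypothetical output of step 1 (SFT)
tokenizer = AutoTokenizer.from_pretrained(sft_model_path)
tokenizer.pad_token = tokenizer.eos_token

# Policy (with a value head for PPO) plus a frozen reference copy for the KL penalty.
model = AutoModelForCausalLMWithValueHead.from_pretrained(sft_model_path)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(sft_model_path)
ppo_trainer = PPOTrainer(PPOConfig(batch_size=1, mini_batch_size=1), model, ref_model, tokenizer)

prompt = "Write a job description for this candidate resume: ..."
query_tensor = tokenizer.encode(prompt, return_tensors="pt")
response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False,
                                       max_new_tokens=64, do_sample=True)

# Stand-in reward: in GIRL this would be the step-2 reward model scoring the JD.
reward = [torch.tensor(1.0)]
stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)  # one PPO update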

3. Fine-Tuning on the MIND Dataset

3.1 Data Preparation

Convert the raw MIND data into the format LoRA fine-tuning expects:
File: generate_json_data.py

import csv
import json
from enum import Enum


class Action(Enum):
    YES = "Yes."
    NO = "No."


"""
Build the training dataset in the following format:
[
{"instruction": "Given the user's preference and unpreference, identify whether the user will like the target news by
answering \"Yes.\" or \"No.\".",
"input": "User Preference: \"Opinion: Colin Kaepernick is about to get what he deserves: a chance\"\nUser Unpreference:
\"Browns apologize to Mason Rudolph, call Myles Garrett's actions 'unacceptable'\",
\"I've been writing about tiny homes for a year and finally spent 2 nights in a 300-foot home to see what it's all about
here's how it went\",\"The Kardashians Face Backlash Over 'Insensitive' Family Food Fight in KUWTK Clip\",
\"THEN AND NOW: What all your favorite '90s stars are doing today\",\"Report: Police investigating woman's death after
Redskins' player Montae Nicholson took her to hospital\",\"U.S. Troops Will Die If They Remain in Syria,
Bashar Al-Assad Warns\",\"3 Indiana judges suspended after a night of drinking turned into a White Castle brawl\",
\"Cows swept away by Hurricane Dorian found alive but how?\",\"Surviving Santa Clarita school shooting victims on
road to recovery: Latest\",\"The Unlikely Star of My Family's Thanksgiving Table\",\"Meghan Markle and Hillary Clinton
Secretly Spent the Afternoon Together at Frogmore Cottage\",\"Former North Carolina State, NBA player Anthony Grundy
dies in stabbing, police say\",\"85 Thanksgiving Recipes You Can Make Ahead\",\"Survivor Contestants Missy Byrd and
Elizabeth Beisel Apologize For Their Actions\",\"Pete Davidson, Kaia Gerber Are Dating, Trying to Stay 'Low Profile'\"
,\"There's a place in the US where its been over 80 degrees since March\",\"Taylor Swift Rep Hits Back at Big Machine,
Claims She's Actually Owed $7.9 Million in Unpaid Royalties\",\"The most talked about movie moments of the 2010s\",
\"Belichick mocks social media in comments on Garrett incident\",\"13 Reasons Why's Christian Navarro Slams Disney
for Casting 'the White Guy' in The Little Mermaid\"\nWhether the user will like the target news \"66 Cool Tech Gifts
Anyone Would Be Thrilled to Receive\"?",
"output": "No."},
...
]
"""

instruction = ("Given the user's preference and unpreference, identify whether the user will like the target news by "
               "answering \"Yes.\" or \"No.\".")

news_dict = {}  # Map each news id to its title, taken from news.tsv.
with open('../data/mind/news.tsv', 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        news_id = row[0]
        news_title = row[3]
        news_dict[news_id] = news_title

data_list = []  # Use behaviors.tsv to collect the news each user liked and disliked.
with open('../data/mind/behaviors.tsv', 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in reader:
        impression = row[4]
        impre_list = impression.split(" ")
        if len(impre_list) >= 5:  # Keep only users with at least 5 impressions.
            preference = []
            unpreference = []
            for impre in impre_list[:-1]:  # Earlier news form the context; the last one is the prediction target.
                [impre_id, action] = impre.split("-")
                title = news_dict[impre_id]
                if int(action) == 1:
                    preference.append("\"" + title + "\"")
                else:
                    unpreference.append("\"" + title + "\"")
            input = "User Preference: " + ','.join(preference) + "\n" + "User Unpreference: " + ','.join(unpreference)
            [impre_id, action] = impre_list[-1].split("-")
            output = Action.YES.value if int(action) == 1 else Action.NO.value
            input = input + "\n" + "Whether the user will like the target news " + "\"" + news_dict[impre_id] + "\"?"
            res_dic = {
                "instruction": instruction,
                "input": input,
                "output": output
            }
            data_list.append(res_dic)

res = json.dumps(data_list, indent=4, ensure_ascii=False)
user_sequence_path = "../data/mind/train.json"  # Save the generated training data.
with open(user_sequence_path, 'w') as file:  # 'w' so reruns don't append invalid JSON
    file.write(res)
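
A quick sanity check (not from the book) to confirm the generated file parses and the records have the expected fields:

import json

with open("../data/mind/train.json") as f:
    samples = json.load(f)

print(f"{len(samples)} samples")
print(samples[0]["instruction"])
print(samples[0]["input"][:200])  # preview of the preference/unpreference context
print(samples[0]["output"])       # "Yes." or "No."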

3.2 Model Fine-Tuning

There are several ways to run the fine-tuning:

  1. With the alpaca-lora framework
  2. With Apple's MLX framework
  3. With the LLaMA-Factory framework
  4. With the run_clm.py script from the Transformers library

The code below implements option 1:
File: finetune_tallrec.py

import os
from typing import List

import fire
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, DataCollatorForSeq2Seq, TrainingArguments

from utils.prompter import Prompter


def train(
    # model/data params
    base_model: str = "",  # the only required argument
    data_path: str = "./data/mind/train.json",
    output_dir: str = "./lora-alpaca",
    # training hyperparams
    batch_size: int = 128,
    micro_batch_size: int = 4,
    num_epochs: int = 3,
    learning_rate: float = 3e-4,
    cutoff_len: int = 256,
    val_set_size: int = 2000,
    # lora hyperparams
    lora_r: int = 8,
    lora_alpha: int = 16,
    lora_dropout: float = 0.05,
    lora_target_modules: List[str] = ["q_proj", "v_proj"],
    # llm hyperparams
    train_on_inputs: bool = True,  # if False, masks out inputs in loss
    add_eos_token: bool = False,
    group_by_length: bool = False,  # faster, but produces an odd training loss curve
    # wandb params
    wandb_project: str = "",
    wandb_run_name: str = "",
    wandb_watch: str = "",  # options: false | gradients | all
    wandb_log_model: str = "",  # options: false | true
    resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
    prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
):
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
    gradient_accumulation_steps = batch_size // micro_batch_size

    prompter = Prompter(prompt_template_name)

    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size

    # Check if parameter passed or if set within environ
    use_wandb = len(wandb_project) > 0 or (
        "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    # Only overwrite environ if wandb param passed
    if len(wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = wandb_project
    if len(wandb_watch) > 0:
        os.environ["WANDB_WATCH"] = wandb_watch
    if len(wandb_log_model) > 0:
        os.environ["WANDB_LOG_MODEL"] = wandb_log_model

    model = LlamaForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    tokenizer = LlamaTokenizer.from_pretrained(base_model)

    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point["input"]
            )
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=add_eos_token
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # could be sped up, probably
        return tokenized_full_prompt

    model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    if data_path.endswith(".json") or data_path.endswith(".jsonl"):  # data file in JSON or JSONL format
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    if resume_from_checkpoint:
        # Check the available weights and load them
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )  # Full checkpoint
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

    if val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = (
            train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        )
        val_data = (
            train_val["test"].shuffle().map(generate_and_tokenize_prompt)
        )
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        model.is_parallelizable = True
        model.model_parallel = True

    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=200 if val_set_size > 0 else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True if val_set_size > 0 else False,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            report_to="wandb" if use_wandb else None,
            run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator=DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    model.save_pretrained(output_dir)


if __name__ == "__main__":
    fire.Fire(train)

File: finetune_tallrec.sh

python finetune_tallrec.py \
--base_model '/Users/liuqiang/Desktop/code/llm/models/chinese-alpaca-2-7b' \
--data_path '/Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/finetune-llm/data/mind/train.json' \
--output_dir './lora-weights' \
--batch_size 128 \
--micro_batch_size 4 \
--num_epochs 1 \
--learning_rate 5e-4 \
--cutoff_len 512 \
--val_set_size 10000 \
--lora_r 8 \
--lora_alpha 16 \
--lora_dropout 0.05 \
--lora_target_modules '[q_proj,v_proj]' \
--train_on_inputs \
--group_by_length
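
Note that model.save_pretrained() on the PEFT model stores only the LoRA adapter weights plus adapter_config.json in --output_dir, not a full model. If a standalone model is needed, a hedged sketch (not from the book) using peft's merge_and_unload(); the paths are placeholders:

import torch
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

base_model = "path/to/chinese-alpaca-2-7b"  # placeholder for the base model path
base = LlamaForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "./lora-weights")

merged = model.merge_and_unload()  # fold the LoRA deltas into the base weights
merged.save_pretrained("./merged-model")
LlamaTokenizer.from_pretrained(base_model).save_pretrained("./merged-model")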

3.3 Model Inference

After fine-tuning, run inference to check whether the predictions have become more accurate.
File: infer_tallrec.py

import os
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter


def main(
    load_8bit: bool = False,
    base_model: str = "",
    lora_weights: str = "tloen/alpaca-lora-7b",
    prompt_template: str = "",  # The prompt template to use, will default to alpaca.
    server_name: str = "0.0.0.0",  # Allows listening on all interfaces by providing '0.0.0.0'.
    share_gradio: bool = False,
):
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    prompter = Prompter(prompt_template)
    tokenizer = LlamaTokenizer.from_pretrained(base_model)

    device_map = "mps"

    model = LlamaForCausalLM.from_pretrained(
        base_model,
        device_map=device_map,
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(
        model,
        lora_weights,
        device_map=device_map,
        torch_dtype=torch.float16,
    )

    # unwind broken decapoda-research config
    model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    def evaluate(
        instruction,
        input=None,
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
        max_new_tokens=128,
        stream_output=False,
        **kwargs,
    ):
        prompt = prompter.generate_prompt(instruction, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device_map)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )

        generate_params = {
            "input_ids": input_ids,
            "generation_config": generation_config,
            "return_dict_in_generate": True,
            "output_scores": True,
            "max_new_tokens": max_new_tokens,
        }

        if stream_output:
            # Stream the reply 1 token at a time.
            # This is based on the trick of using 'stopping_criteria' to create an iterator,
            # from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
            def generate_with_callback(callback=None, **kwargs):
                kwargs.setdefault(
                    "stopping_criteria", transformers.StoppingCriteriaList()
                )
                kwargs["stopping_criteria"].append(
                    Stream(callback_func=callback)
                )
                with torch.no_grad():
                    model.generate(**kwargs)

            def generate_with_streaming(**kwargs):
                return Iteratorize(
                    generate_with_callback, kwargs, callback=None
                )

            with generate_with_streaming(**generate_params) as generator:
                for output in generator:
                    # new_tokens = len(output) - len(input_ids[0])
                    decoded_output = tokenizer.decode(output)

                    if output[-1] in [tokenizer.eos_token_id]:
                        break

                    yield prompter.get_response(decoded_output)
            return  # early return for stream_output

        # Without streaming
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)
        yield prompter.get_response(output)

    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder="Tell me about alpacas.",
            ),
            gr.components.Textbox(lines=2, label="Input", placeholder="none"),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.1, label="Temperature"
            ),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.75, label="Top p"
            ),
            gr.components.Slider(
                minimum=0, maximum=100, step=1, value=40, label="Top k"
            ),
            gr.components.Slider(
                minimum=1, maximum=4, step=1, value=4, label="Beams"
            ),
            gr.components.Slider(
                minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
            ),
            gr.components.Checkbox(label="Stream output"),
        ],
        outputs=[
            gr.components.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="🦙🌲 Alpaca-LoRA",
        description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",  # noqa: E501
    ).queue().launch(server_name=server_name, share=share_gradio)


if __name__ == "__main__":
    fire.Fire(main)

File: infer_tallrec.sh

python infer_tallrec.py \
--base_model '/Users/liuqiang/Desktop/code/llm/models/chinese-alpaca-2-7b' \
--lora_weights '/Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/finetune-llm/lora-weights'
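
The Gradio UI is convenient for spot checks, but judging whether fine-tuning actually improved accuracy calls for batch scoring. A hedged sketch (not from the book) that greedily decodes the model's "Yes."/"No." answer over a handful of samples; the base-model path is a placeholder, and in practice you would score a held-out split rather than train.json:

import json

import torch
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer

from utils.prompter import Prompter

base_model = "path/to/chinese-alpaca-2-7b"  # placeholder
model = LlamaForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16, device_map="auto")
model = PeftModel.from_pretrained(model, "./lora-weights")
model.eval()
tokenizer = LlamaTokenizer.from_pretrained(base_model)
prompter = Prompter("alpaca")

with open("./data/mind/train.json") as f:
    samples = json.load(f)[:20]  # use a held-out split in practice

hits = 0
for s in samples:
    prompt = prompter.generate_prompt(s["instruction"], s["input"])
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        out = model.generate(input_ids=input_ids, max_new_tokens=4)  # greedy decoding
    answer = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True).strip()
    hits += answer.startswith(s["output"].rstrip("."))  # "Yes." -> "Yes"

print(f"accuracy on {len(samples)} samples: {hits / len(samples):.2f}")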