微调范式

微调大模型进行个性化推荐

1、微调的方法

指令微调
RLHF

2、案例

2.1 TALLRec 微调框架

指令数据模版的构建
二次微调的原理（指令微调、推荐微调）

2.2 GIRL：基于人类反馈的微调框架

三种 JD 推荐范式
1. 常规的工作推荐
2. 生成式工作推荐
3. 生成增强的工作推荐
基于 RLHF 的 JD 生成框架
1. 监督微调
2. 奖励模型训练
3. PPO 更新参数
生成增强的 JD 推荐方案

3、基于 MIND 数据集实现微调

3.1 数据准备

将 MIND 数据集转换为 LoRA 微调所需的指令数据格式（JSON）：
文件 generate_json_data.py

import csv
import json
from enum import Enum


class Action(Enum):
    """Closed set of answer strings written to the ``output`` field of a sample."""

    YES = "Yes."  # emitted when the impression's action flag equals 1
    NO = "No."  # emitted for every other action flag


# Layout of the generated training file: a JSON array in which every element
# is one instruction-tuning sample, for example
# [
#     {
#         "instruction": "Given the user's preference and unpreference, identify whether the user will like the
#                         target movie by answering \"Yes.\" or \"No.\".",
#         "input": "User Preference: \"<title>\",\"<title>\"\nUser Unpreference: \"<title>\",\"<title>\"\n
#                   Whether the user will like the targe news \"<candidate title>\"?",
#         "output": "No."
#     },
#     ...
# ]

# NOTE(review): the prompt text says "target movie" and "targe news" although
# the data is news. The strings are deliberately kept byte-identical here,
# presumably any prompt used at inference time has to match them -- confirm
# before correcting the wording.
instruction = ("Given the user's preference and unpreference, identify whether the user will like the target movie by "
               "answering \"Yes.\" or \"No.\".")

NEWS_PATH = '../data/mind/news.tsv'
BEHAVIORS_PATH = '../data/mind/behaviors.tsv'
TRAIN_JSON_PATH = "../data/mind/train.json"
MIN_IMPRESSIONS = 5  # a row needs at least this many impressions to become a sample


def read_tsv(path):
    """Yield the non-empty rows of a tab-separated file as lists of strings.

    Quote processing is disabled (``csv.QUOTE_NONE``) so that a field which
    merely starts with a double quote cannot swallow the following columns or
    lines; the file is always decoded as UTF-8 instead of the locale default.
    """
    with open(path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE):
            if row:  # the csv module yields [] for blank lines
                yield row


def load_news_titles(news_path=NEWS_PATH):
    """Return a ``{news_id: title}`` dict built from columns 0 and 3 of the news file."""
    return {row[0]: row[3] for row in read_tsv(news_path)}


def split_impression(item):
    """Split one ``<news_id>-<flag>`` token into ``(news_id, clicked)``.

    Raises:
        ValueError: if the part after the last ``-`` is not an integer.
    """
    news_id, _, flag = item.rpartition("-")
    return news_id, int(flag) == 1


def build_sample(impressions, news_titles):
    """Convert one space-separated impression string into a training sample.

    All impressions except the last one are sorted into liked / disliked
    titles; the last impression is the prediction target.

    Returns:
        A dict with ``instruction``, ``input`` and ``output`` keys, or ``None``
        when the string holds fewer than ``MIN_IMPRESSIONS`` impressions.

    Raises:
        KeyError: if an impression refers to a news id missing from ``news_titles``.
    """
    items = impressions.split(" ")
    if len(items) < MIN_IMPRESSIONS:
        return None

    *history, target = items
    liked = []
    disliked = []
    for item in history:
        news_id, clicked = split_impression(item)
        quoted_title = "\"" + news_titles[news_id] + "\""
        (liked if clicked else disliked).append(quoted_title)

    target_id, target_clicked = split_impression(target)
    prompt = (
        f"User Preference: {','.join(liked)}\n"
        f"User Unpreference: {','.join(disliked)}\n"
        f"Whether the user will like the targe news \"{news_titles[target_id]}\"?"
    )
    answer = Action.YES if target_clicked else Action.NO
    return {
        "instruction": instruction,
        "input": prompt,
        "output": answer.value,
    }


def build_dataset(behaviors_path, news_titles):
    """Build the list of samples from column 4 (the impression list) of the behaviors file."""
    samples = []
    for row in read_tsv(behaviors_path):
        sample = build_sample(row[4], news_titles)
        if sample is not None:
            samples.append(sample)
    return samples


def write_dataset(samples, output_path=TRAIN_JSON_PATH):
    """Write ``samples`` to ``output_path`` as one indented UTF-8 JSON array.

    The file is truncated first: appending a second array to an existing file
    would leave it as invalid JSON.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(samples, file, indent=4, ensure_ascii=False)


def main(news_path=NEWS_PATH, behaviors_path=BEHAVIORS_PATH, output_path=TRAIN_JSON_PATH):
    """Run the conversion: news titles -> samples -> training JSON file."""
    news_titles = load_news_titles(news_path)
    write_dataset(build_dataset(behaviors_path, news_titles), output_path)


if __name__ == "__main__":
    main()

3.2 模型微调

有多种方法：

基于alpaca-lora框架进行微调
基于苹果 MLX 框架进行微调
基于 LLaMA-Factory 框架进行微调
基于 Transformers 库的 run_clm.py 进行微调

下面是上述第一种方法（基于 alpaca-lora 框架）的微调代码：
文件finetune_tallrec.py

import os
from typing import List

import fire
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, DataCollatorForSeq2Seq, TrainingArguments

from utils.prompter import Prompter


def train(
        # model/data params
        base_model: str = "",  # the only required argument
        data_path: str = "./data/mind/train.json",
        output_dir: str = "./lora-alpaca",
        # training hyperparams
        batch_size: int = 128,
        micro_batch_size: int = 4,
        num_epochs: int = 3,
        learning_rate: float = 3e-4,
        cutoff_len: int = 256,
        val_set_size: int = 2000,
        # lora hyperparams
        lora_r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.05,
        lora_target_modules: List[str] = ["q_proj", "v_proj"],
        # llm hyperparams
        train_on_inputs: bool = True,  # if False, masks out inputs in loss
        add_eos_token: bool = False,
        group_by_length: bool = False,  # faster, but produces an odd training loss curve
        # wandb params
        wandb_project: str = "",
        wandb_run_name: str = "",
        wandb_watch: str = "",  # options: false | gradients | all
        wandb_log_model: str = "",  # options: false | true
        resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
        prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
):
    """Fine-tune a LLaMA causal language model with LoRA adapters.

    Loads ``base_model`` in float16, wraps it with a ``LoraConfig``, renders
    every record of ``data_path`` through the chosen prompt template, trains
    with ``transformers.Trainer`` and finally calls
    ``model.save_pretrained(output_dir)`` on the PEFT-wrapped model.

    Args:
        base_model: Model name or path handed to ``from_pretrained``; must be non-empty.
        data_path: ``.json`` / ``.jsonl`` file, or any other dataset identifier
            accepted by ``load_dataset``.
        output_dir: Directory for checkpoints and the final saved model.
        batch_size: Target batch size; ``batch_size // micro_batch_size`` becomes
            the number of gradient accumulation steps (further divided by
            ``WORLD_SIZE`` when that environment variable is not 1).
        micro_batch_size: Per-device train batch size.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        cutoff_len: ``max_length`` used for tokenizer truncation.
        val_set_size: Size of the held-out split; ``0`` disables evaluation.
        lora_r, lora_alpha, lora_dropout, lora_target_modules: Forwarded to ``LoraConfig``.
        train_on_inputs: When false, labels of the prompt tokens are set to ``-100``.
        add_eos_token: Only consulted when ``train_on_inputs`` is false, while
            measuring the prompt length to mask.
        group_by_length: Forwarded to ``TrainingArguments``.
        wandb_project, wandb_watch, wandb_log_model: When non-empty, exported as
            the ``WANDB_PROJECT`` / ``WANDB_WATCH`` / ``WANDB_LOG_MODEL`` variables.
        wandb_run_name: Run name used when wandb reporting is active.
        resume_from_checkpoint: Directory containing ``pytorch_model.bin`` or
            ``adapter_model.bin`` to restore weights from.
        prompt_template_name: Template name handed to ``Prompter``.
    """
    assert base_model, "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    gradient_accumulation_steps = batch_size // micro_batch_size
    prompter = Prompter(prompt_template_name)

    # A WORLD_SIZE other than 1 means a distributed launch: pin the model to
    # this process' LOCAL_RANK and split the accumulation across processes.
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size
    else:
        device_map = "auto"

    # wandb is active when a project is given on the command line or is
    # already present (and non-empty) in the environment.
    use_wandb = len(wandb_project) > 0 or len(os.environ.get("WANDB_PROJECT", "")) > 0
    # Environment variables are only overwritten by explicitly passed values.
    for env_key, cli_value in (
            ("WANDB_PROJECT", wandb_project),
            ("WANDB_WATCH", wandb_watch),
            ("WANDB_LOG_MODEL", wandb_log_model),
    ):
        if len(cli_value) > 0:
            os.environ[env_key] = cli_value

    model = LlamaForCausalLM.from_pretrained(
        base_model, torch_dtype=torch.float16, device_map=device_map
    )

    tokenizer = LlamaTokenizer.from_pretrained(base_model)
    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, append_eos=True):
        """Tokenize ``prompt`` (truncated to ``cutoff_len``) and copy the ids into ``labels``."""
        encoded = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        token_ids = encoded["input_ids"]
        # Append EOS only when it is missing and there is still room for it.
        if (
                token_ids[-1] != tokenizer.eos_token_id
                and len(token_ids) < cutoff_len
                and append_eos
        ):
            token_ids.append(tokenizer.eos_token_id)
            encoded["attention_mask"].append(1)

        encoded["labels"] = token_ids.copy()
        return encoded

    def generate_and_tokenize_prompt(data_point):
        """Render one record with the prompt template and tokenize it."""
        tokenized = tokenize(
            prompter.generate_prompt(
                data_point["instruction"],
                data_point["input"],
                data_point["output"],
            )
        )
        if train_on_inputs:
            return tokenized

        # Tokenize the prompt without the answer to learn how many leading
        # tokens have to be excluded from the loss.
        prompt_only = prompter.generate_prompt(
            data_point["instruction"], data_point["input"]
        )
        masked_len = len(tokenize(prompt_only, append_eos=add_eos_token)["input_ids"])
        if add_eos_token:
            masked_len -= 1

        tokenized["labels"] = [-100] * masked_len + tokenized["labels"][masked_len:]
        return tokenized

    model = prepare_model_for_int8_training(model)
    model = get_peft_model(
        model,
        LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        ),
    )

    # JSON / JSONL files go through the "json" loader, anything else is
    # handed to load_dataset unchanged.
    if data_path.endswith((".json", ".jsonl")):
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    if resume_from_checkpoint:
        # Prefer a full checkpoint; fall back to LoRA-only weights.
        weights_file = os.path.join(resume_from_checkpoint, "pytorch_model.bin")
        if not os.path.exists(weights_file):
            # only LoRA model - LoRA config above has to fit
            weights_file = os.path.join(resume_from_checkpoint, "adapter_model.bin")
            resume_from_checkpoint = False  # So the trainer won't try loading its state
        # Both file names are loaded the same way below.
        if os.path.exists(weights_file):
            print(f"Restarting from {weights_file}")
            set_peft_model_state_dict(model, torch.load(weights_file))
        else:
            print(f"Checkpoint {weights_file} not found")

    model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

    has_val = val_set_size > 0
    if has_val:
        splits = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = splits["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = splits["test"].shuffle().map(generate_and_tokenize_prompt)
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        model.is_parallelizable = True
        model.model_parallel = True

    training_args = TrainingArguments(
        per_device_train_batch_size=micro_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps" if has_val else "no",
        save_strategy="steps",
        eval_steps=200 if has_val else None,
        save_steps=200,
        output_dir=output_dir,
        save_total_limit=3,
        load_best_model_at_end=has_val,
        ddp_find_unused_parameters=False if ddp else None,
        group_by_length=group_by_length,
        report_to="wandb" if use_wandb else None,
        run_name=wandb_run_name if use_wandb else None,
    )
    collator = DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    )
    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=training_args,
        data_collator=collator,
    )
    model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    model.save_pretrained(output_dir)


# Script entry point: python-fire maps the keyword parameters of train() to
# command-line flags (e.g. --base_model, --data_path).
if __name__ == "__main__":
    fire.Fire(train)

文件finetune_tallrec.sh


# Launch LoRA fine-tuning with finetune_tallrec.py.
# Every flag corresponds to a keyword parameter of train(); the values below
# override its defaults for epochs (1), learning rate (5e-4), cutoff length
# (512) and validation set size (10000).
# The base model and data paths are absolute paths on the author's machine --
# adjust them before running elsewhere.
python finetune_tallrec.py \
    --base_model '/Users/liuqiang/Desktop/code/llm/models/chinese-alpaca-2-7b' \
    --data_path '/Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/basic_skills/finetune-llm/data/mind/train.json' \
    --output_dir './lora-weights' \
    --batch_size 128 \
    --micro_batch_size 4 \
    --num_epochs 1 \
    --learning_rate 5e-4 \
    --cutoff_len 512 \
    --val_set_size 10000 \
    --lora_r 8 \
    --lora_alpha 16 \
    --lora_dropout 0.05 \
    --lora_target_modules '[q_proj,v_proj]' \
    --train_on_inputs \
    --group_by_length

3.3 模型推断

微调后，进行推断，检验效果是否更加精准。
文件infer_tallrec.py

import os
import sys

import fire
import gradio as gr
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

from utils.callbacks import Iteratorize, Stream
from utils.prompter import Prompter


def main(
        load_8bit: bool = False,
        base_model: str = "",
        lora_weights: str = "tloen/alpaca-lora-7b",
        prompt_template: str = "",  # The prompt template to use, will default to alpaca.
        server_name: str = "0.0.0.0",  # "0.0.0.0" makes the server listen on all interfaces.
        share_gradio: bool = False,
):
    """Serve a LoRA-adapted LLaMA model through a Gradio web interface.

    Args:
        load_8bit: When true, the final ``model.half()`` call is skipped; the
            flag is not used anywhere else in this function.
        base_model: Model name or path; falls back to the ``BASE_MODEL``
            environment variable and must end up non-empty.
        lora_weights: Adapter name or path handed to ``PeftModel.from_pretrained``.
        prompt_template: Template name handed to ``Prompter``.
        server_name: Host passed to ``launch``.
        share_gradio: Passed to ``launch`` as ``share``.
    """
    base_model = base_model or os.environ.get("BASE_MODEL", "")
    assert base_model, "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"

    prompter = Prompter(prompt_template)
    tokenizer = LlamaTokenizer.from_pretrained(base_model)

    # The device is fixed to Apple's Metal backend; both the base model and
    # the adapter are loaded onto it in float16.
    device_map = "mps"
    load_kwargs = {"device_map": device_map, "torch_dtype": torch.float16}
    model = LlamaForCausalLM.from_pretrained(base_model, **load_kwargs)
    model = PeftModel.from_pretrained(model, lora_weights, **load_kwargs)

    # unwind broken decapoda-research config
    model.config.pad_token_id = 0  # unk
    tokenizer.pad_token_id = 0
    model.config.bos_token_id = 1
    model.config.eos_token_id = 2

    if not load_8bit:
        model.half()  # seems to fix bugs for some users.

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    def evaluate(
            instruction,
            input=None,
            temperature=0.1,
            top_p=0.75,
            top_k=40,
            num_beams=4,
            max_new_tokens=128,
            stream_output=False,
            **kwargs,
    ):
        """Generate a response for one instruction; yields the text (incrementally when streaming)."""
        prompt = prompter.generate_prompt(instruction, input)
        input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device_map)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        generate_params = {
            "input_ids": input_ids,
            "generation_config": generation_config,
            "return_dict_in_generate": True,
            "output_scores": True,
            "max_new_tokens": max_new_tokens,
        }

        if not stream_output:
            # Without streaming: one generate call, one yielded response.
            with torch.no_grad():
                generation_output = model.generate(**generate_params)
            full_text = tokenizer.decode(generation_output.sequences[0])
            yield prompter.get_response(full_text)
            return

        # Stream the reply 1 token at a time.
        # This is based on the trick of using 'stopping_criteria' to create an iterator,
        # from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
        def generate_with_callback(callback=None, **kwargs):
            criteria = kwargs.setdefault(
                "stopping_criteria", transformers.StoppingCriteriaList()
            )
            criteria.append(Stream(callback_func=callback))
            with torch.no_grad():
                model.generate(**kwargs)

        def generate_with_streaming(**kwargs):
            return Iteratorize(generate_with_callback, kwargs, callback=None)

        with generate_with_streaming(**generate_params) as generator:
            for output in generator:
                partial_text = tokenizer.decode(output)

                # Stop before yielding once the end-of-sequence token shows up.
                if output[-1] in [tokenizer.eos_token_id]:
                    break

                yield prompter.get_response(partial_text)

    # Widgets are listed in the positional order of evaluate()'s parameters.
    input_widgets = [
        gr.components.Textbox(
            lines=2,
            label="Instruction",
            placeholder="Tell me about alpacas.",
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
        ),
        gr.components.Checkbox(label="Stream output"),
    ]
    output_widgets = [gr.components.Textbox(lines=5, label="Output")]

    demo = gr.Interface(
        fn=evaluate,
        inputs=input_widgets,
        outputs=output_widgets,
        title="🦙🌲 Alpaca-LoRA",
        description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",
        # noqa: E501
    )
    demo.queue().launch(server_name=server_name, share=share_gradio)


# Script entry point: python-fire maps the keyword parameters of main() to
# command-line flags (e.g. --base_model, --lora_weights).
if __name__ == "__main__":
    fire.Fire(main)

文件infer_tallrec.sh

# Start the Gradio inference UI with the base model plus the fine-tuned LoRA weights.
# NOTE(review): finetune_tallrec.sh writes its adapter to the relative path
# './lora-weights'; confirm that the absolute --lora_weights path below
# points at that same directory.
python infer_tallrec.py \
    --base_model '/Users/liuqiang/Desktop/code/llm/models/chinese-alpaca-2-7b' \
    --lora_weights '/Users/liuqiang/Desktop/code/llm4rec/llm4rec_abc/src/finetune-llm/lora-weights'