大模型在电商推荐中的应用

1、冷启动

1.1 数据准备

由于需要为冷启动商品生成模拟的用户行为，因此首先要确定哪些商品属于冷启动商品。

将用户行为数据按照时间排序，前 70%作为训练样本，后 30%作为测试样本。

只在测试数据中出现但不在训练数据中出现的商品就被认为是冷启动商品。

代码cold_start/utils/utils.py

import csv
import json

import pandas as pd

# Fraction of the time-ordered review rows that forms the training split;
# the remaining rows form the test split.
TRAIN_RATIO = 0.7


def parse(path):
    """Lazily yield one decoded JSON document per line of a JSON-lines file.

    :param path: path of a file that stores one JSON document per line
    :return: generator over the decoded documents, in file order
    """
    # The context manager guarantees the handle is closed once the generator
    # is exhausted or discarded.
    with open(path, 'r') as g:
        for row in g:
            if not row.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            yield json.loads(row)


def get_df(path):
    """Load a JSON-lines file into a DataFrame, one row per record.

    Rows are labelled 0..n-1 in the order the records appear in the file.

    :param path: path of the JSON-lines file
    :return: DataFrame built from the parsed records
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')


def get_cold_start_items(path_review: str = "../data/amazon_review/beauty/All_Beauty_5.json") -> set[str]:
    """
    Return the items that are treated as cold-start items.

    The review rows are sorted by ``unixReviewTime`` in ascending order. The
    first ``TRAIN_RATIO`` of the rows is the training split and the rest is
    the test split. An item (``asin``) is cold-start when it occurs in the
    test split but never in the training split.

    :param path_review: path of the user-review JSON-lines file
    :return: set of cold-start item ids
    """
    df_view = get_df(path_review)

    # Sort by review time, oldest first, and renumber the rows.
    df_view = df_view.sort_values('unixReviewTime', ascending=True).reset_index(drop=True)

    # Use the shared TRAIN_RATIO constant (not a literal 0.7) so this split
    # can never drift away from the one computed in get_user_history.
    train_num = int(df_view.shape[0] * TRAIN_RATIO)

    train_items = set(df_view['asin'].iloc[:train_num])
    test_items = set(df_view['asin'].iloc[train_num:])

    return test_items - train_items


def get_user_history(path_review: str = "../data/amazon_review/beauty/All_Beauty_5.json",
                     data_type: str = "train") -> dict:
    """
    Build a ``reviewerID -> set of asin`` mapping for one split of the reviews.

    The review rows are sorted by ``unixReviewTime`` in ascending order; the
    first ``TRAIN_RATIO`` of the rows is the "train" split, the rest is the
    "test" split.

    :param path_review: path of the user-review JSON-lines file
    :param data_type: ``"train"`` for the first split, ``"test"`` for the second
    :return: dict mapping each reviewer id to the set of item ids they reviewed
    :raises ValueError: if ``data_type`` is neither ``"train"`` nor ``"test"``
    """
    # Validate up front: previously an unknown value left the frame as None
    # and failed later with an unrelated AttributeError.
    if data_type not in ("train", "test"):
        raise ValueError(f"data_type must be 'train' or 'test', got {data_type!r}")

    df_view = get_df(path_review)

    # Sort by review time, oldest first, and renumber the rows.
    df_view = df_view.sort_values('unixReviewTime', ascending=True).reset_index(drop=True)

    train_num = int(df_view.shape[0] * TRAIN_RATIO)
    df = df_view.head(train_num) if data_type == "train" else df_view.iloc[train_num:]

    # Each group holds every review row of one reviewer; only the item ids are
    # kept, and storing them in a set drops repeated reviews of the same item.
    return {
        reviewer_id: set(group['asin'])
        for reviewer_id, group in df.groupby('reviewerID')
    }


def get_metadata_dict(path: str = '../data/amazon_review/beauty/meta_All_Beauty.json') -> dict:
    """
    Read the product metadata file and keep the core fields of every item.

    Each line of the file is one JSON document. The fields ``title``,
    ``brand``, ``description`` and ``price`` are kept, keyed by ``asin``.

    :param path: path of the product-metadata JSON-lines file
    :return: dict mapping item id to ``{"title", "brand", "description", "price"}``
    """
    item_dict = {}
    with open(path, 'r') as file:
        # Iterate over the lines directly. Routing the file through
        # csv.reader(delimiter='\n') yields an empty row for a blank line
        # (IndexError on row[0]) and is subject to the csv field-size limit.
        for line in file:
            if not line.strip():
                continue
            j = json.loads(line)
            price = j['price']
            # Some records carry long non-price text in the price field;
            # treat those as "no price".
            if price != "" and '$' not in price and len(price) > 10:
                price = ""
            item_dict[j['asin']] = {
                "title": j['title'],
                "brand": j['brand'],
                "description": j['description'],
                "price": price
            }
    return item_dict

将训练数据生成为微调样本。
代码cold_start/data-process/generate_finetune_data.py

import importlib
import json
import sys

# Make the sibling ``utils`` directory importable and load its ``utils`` module.
# NOTE: '../utils' is a relative path, so it is resolved against the current
# working directory — presumably the script is meant to be run from its own folder.
sys.path.append('../utils')
utils = importlib.import_module('utils')

# Share of a user's items that is shown as purchase history in the prompt;
# the remaining items become the expected output of the sample.
TRAIN_RATIO = 0.7

# Stored as the "instruction" field of every generated fine-tuning sample.
instruction = ("You are a product expert who predicts which products "
               "users prefer based on your professional knowledge.")


def formatting_input(history, candidate):
    """Render the ``input`` text of a fine-tuning sample.

    :param history: JSON text describing the products the user purchased
    :param candidate: JSON text describing the candidate products
    :return: the prompt asking which candidates the user will prefer
    """
    return f"""The user purchased the following beauty products(in JSON format): 

{history}

Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):

{candidate} 

You can choice none, one or more, your output must be JSON format, you just need output item_id, the following is an
output example, A and B is product item_id.

["A", "B"]

Your output must in the candidate list, don't explain.
"""


"""
按照如下格式构建训练数据集：
[
    {
        "instruction": "You are a product expert who predicts which products users prefer based on your professional knowledge.",
        "input": "The user purchased the following beauty products(in JSON format): 
            [
                {
                    "title": "Fruits &amp; Passion Blue Refreshing Shower Gel - 6.7 fl. oz.",
                    "brand": "Fruits & Passion",
                    "price": "",
                    "item_id": "B000FI4S1E"
                },
                {
                    "title": "Yardley By Yardley Of London Unisexs Lay It On Thick Hand &amp; Foot Cream 5.3 Oz",
                    "brand": "Yardley",
                    "price": "",
                    "item_id": "B0009RF9DW"
                }
            ]
            
        Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):
        
           [
                {
                    "title": "Helen of Troy 1579 Tangle Free Hot Air Brush, White, 3/4 Inch Barrel",
                    "brand": "Helen Of Troy",
                    "price": "$28.70",
                    "item_id": "B000WYJTZG"
                },
                {
                    "title": "Dolce &amp; Gabbana Compact Parfum, 0.05 Ounce",
                    "brand": "Dolce & Gabbana",
                    "price": "",
                    "item_id": "B019V2KYZS"
                },
                ...
            ]
        "output": '["B0012Y0ZG2","B000URXP6E"]'
    },
    ...
]
"""


def generate_data(output_path: str = '../data/train.json'):
    """
    Build the instruction-tuning dataset and write it as one JSON array.

    One sample is produced for every training-split user who has at least two
    non-cold-start items. The first ``TRAIN_RATIO`` of that user's items is
    rendered as the purchase history in the prompt, the remaining items are
    the expected output. All samples share the same candidate list: every
    non-cold-start item that occurs in the training split.

    :param output_path: destination file; it is overwritten on every run so
        that it always holds exactly one valid JSON array
    """
    item_dict = utils.get_metadata_dict()
    train_user_dict = utils.get_user_history(data_type="train")
    cold_start_items = utils.get_cold_start_items()

    def brief(item_id):
        # Work on a copy so the shared metadata dict is never mutated.
        # ``description`` is dropped because it is long and costs many tokens.
        info = {key: value for key, value in item_dict[item_id].items() if key != 'description'}
        info['item_id'] = item_id
        return info

    # All items seen in the training split, minus the cold-start ones.
    action_items = set().union(*train_user_dict.values())
    unique_items = action_items.difference(cold_start_items)
    candidate = json.dumps([brief(item) for item in unique_items], indent=4, ensure_ascii=False)

    data_list = []
    for history in train_user_dict.values():
        history = [item for item in history if item not in cold_start_items]
        if len(history) < 2:  # need at least one history item and one target item
            continue
        # NOTE(review): if get_user_history returns sets (as the utils.py shown
        # alongside this script does), the order of ``history`` is arbitrary,
        # so this split is not chronological — confirm that is acceptable.
        train_num = int(len(history) * TRAIN_RATIO)
        train_history = history[:train_num]
        test_history = history[train_num:]

        history_json = json.dumps([brief(h) for h in train_history], indent=4, ensure_ascii=False)
        data_list.append({
            "instruction": instruction,
            "input": formatting_input(history_json, candidate),
            "output": json.dumps(test_history, indent=4, ensure_ascii=False)
        })

    train_res = json.dumps(data_list, indent=4, ensure_ascii=False)
    # Mode 'w', not 'a': appending a second JSON array to an existing file
    # would leave a document that no JSON parser accepts.
    with open(output_path, 'w', encoding='utf-8') as file_:
        file_.write(train_res)


# Script entry point: build the fine-tuning dataset at the default output path.
if __name__ == "__main__":
    generate_data()

1.2 生成冷启动商品的行为样本

如何为冷启动商品生成模拟的用户行为呢？

在训练集中随机找20%的用户
在冷启动商品中随机找两个商品
让大模型基于用户过往行为，选择一个用户可能喜欢的商品

代码cold_start/data-process/generate_samples.py

import importlib
import json
import os
import random
import sys
import time

from openai import OpenAI

# Make the sibling ``utils`` directory importable and load its ``utils`` module.
# NOTE: '../utils' is relative, so it is resolved against the current working directory.
sys.path.append('../utils')
utils = importlib.import_module('utils')

from dotenv_vault import load_dotenv  # pip install --upgrade python-dotenv-vault

# Presumably loads the project's dotenv settings into the environment — see
# https://vault.dotenv.org/ui/ui1
load_dotenv()  # https://vault.dotenv.org/ui/ui1

# API key for the Moonshot endpoint, read from the environment (None when unset).
MOONSHOT_API_KEY = os.getenv("MOONSHOT_API_KEY")

# Sent as the system message of every chat request.
instruction = ("You are a product expert who predicts which of the two products "
               "users prefer based on your professional knowledge.")


def formatting_prompt(History, item_a, item_b):
    """Render the A/B preference prompt for one user.

    :param History: JSON text describing the products the user purchased
    :param item_a: JSON text describing product A
    :param item_b: JSON text describing product B
    :return: the prompt asking the model to answer with "A" or "B"
    """
    return f"""The user purchased the following beauty products in JSON format: 

{History}

Predict if the user will prefer to purchase product A or B in the next.

A is:

{item_a} 

B is:

{item_b}

Your answer must be A or B, don't explain.
"""


def generate_cold_start_samples(store_path: str = '../data/cold_start_action_sample.json'):
    """
    Simulate user actions on cold-start items with the help of an LLM.

    Roughly 20% of the training-split users are picked at random. For each
    picked user two random cold-start items are offered as "A" and "B" and the
    model is asked which one the user would prefer, given the user's purchase
    history. Every valid answer becomes one ``{"user", "item"}`` sample.

    :param store_path: destination file; it is overwritten on every run so
        that it always holds exactly one valid JSON array
    """
    item_dict = utils.get_metadata_dict()
    user_history = utils.get_user_history(data_type="train")
    # Materialise the pool once; random.sample needs a sequence.
    cold_start_pool = list(utils.get_cold_start_items())

    # One client is enough for all requests.
    client = OpenAI(
        api_key=MOONSHOT_API_KEY,
        base_url="https://api.moonshot.cn/v1",
    )

    generated_samples = []
    i = 0
    for user, history in user_history.items():
        if random.random() >= 0.2:  # keep roughly 20% of the users
            continue

        item_a, item_b = random.sample(cold_start_pool, 2)
        history_json = json.dumps([item_dict[h] for h in history], indent=4, ensure_ascii=False)
        a_json = json.dumps(item_dict[item_a], indent=4, ensure_ascii=False)
        b_json = json.dumps(item_dict[item_b], indent=4, ensure_ascii=False)
        prom = formatting_prompt(history_json, a_json, b_json)

        llm_response = client.chat.completions.create(
            model="moonshot-v1-32k",  # moonshot-v1-8k / moonshot-v1-32k / moonshot-v1-128k
            messages=[
                {
                    "role": "system",
                    "content": instruction,
                },
                {"role": "user", "content": prom},
            ],
            temperature=0.1,
            stream=False,
        )
        choice = llm_response.choices[0].message.content.strip()

        # Store exactly one sample per valid answer. Previously an "A" answer
        # was appended twice and any other text appended an empty dict.
        picked = {"A": item_a, "B": item_b}.get(choice)
        sample = {} if picked is None else {"user": user, "item": picked}
        if sample:
            generated_samples.append(sample)

        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(sample, indent=4, ensure_ascii=False))
        if i % 7 == 0:
            time.sleep(1)  # throttle so the API does not reject the calls as too frequent

    res = json.dumps(generated_samples, indent=4, ensure_ascii=False)

    # Mode 'w', not 'a': appending a second JSON array would corrupt the file.
    with open(store_path, 'w', encoding='utf-8') as file:
        file.write(res)


# Script entry point: generate the simulated cold-start samples at the default path.
if __name__ == "__main__":
    generate_cold_start_samples()

一旦为冷启动商品生成了模拟行为，就可以加入真实用户行为数据中。
冷启动商品慢慢变热，就可以采用传统的推荐算法模型了。

1.3 上下文学习能力

冷启动召回：对同时出现在训练集和测试集中的用户，将其在训练集中的行为作为兴趣历史，然后将所有冷启动商品作为候选集，让大模型筛选出用户可能感兴趣的商品（用户在测试集中的行为用于后续的效果评估）。

代码cold_start/item_cold_start_rec.py

import json
import os
import time

import torch
from dotenv_vault import load_dotenv  # pip install --upgrade python-dotenv-vault
from openai import OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM

from utils.utils import get_metadata_dict, get_user_history, get_cold_start_items

# Presumably loads the project's dotenv settings into the environment — see
# https://vault.dotenv.org/ui/ui1
load_dotenv()  # https://vault.dotenv.org/ui/ui1

# API key for the Moonshot endpoint, read from the environment (None when unset).
MOONSHOT_API_KEY = os.getenv("MOONSHOT_API_KEY")

# Used as the system message for the API model and as the "Instruction"
# section of the prompt for the local model.
instruction = ("You are a product expert who predicts which products "
               "users prefer based on your professional knowledge.")


def formatting_prompt(history, candidate):
    """Render the recommendation prompt for one user.

    :param history: JSON text describing the products the user purchased
    :param candidate: JSON text describing the candidate products
    :return: the prompt asking which candidates the user will prefer
    """
    return f"""The user purchased the following beauty products(in JSON format): 

{history}

Predict if the user will prefer to purchase the following beauty candidate list(in JSON format):

{candidate} 

You can choice none, one or more, your output must be JSON format, you just need output item_id, the following is an
output example, A and B is product item_id.

["A", "B"]

Your output must in the candidate list, don't explain.
"""


def llm_api_cold_start_rec(store_path: str = 'data/llm_api_rec.json'):
    """
    Ask the Moonshot chat API to recommend cold-start items to every user
    that appears in both the train and the test split.

    The prompt shows the user's training-split history and the full list of
    cold-start items; the raw model answer is stored per user as
    ``{"user", "rec"}``.

    :param store_path: destination file; it is overwritten on every run so
        that it always holds exactly one valid JSON array
    """
    item_dict = get_metadata_dict()
    train_user_dict = get_user_history(data_type="train")
    test_user_dict = get_user_history(data_type="test")
    common_users = set(train_user_dict.keys()).intersection(set(test_user_dict.keys()))
    cold_start_items = get_cold_start_items()

    def brief(item_id):
        # Copy without ``description`` (too long, costs too many tokens);
        # the shared metadata dict itself is left untouched.
        return {key: value for key, value in item_dict[item_id].items() if key != 'description'}

    # The candidate list is the same for every user, so build it once.
    candidate = json.dumps(
        [{**brief(item), 'item_id': item} for item in cold_start_items],
        indent=4, ensure_ascii=False)

    # One client is enough for all requests.
    client = OpenAI(
        api_key=MOONSHOT_API_KEY,
        base_url="https://api.moonshot.cn/v1",
    )

    generated_rec = []
    print("total user number = " + str(len(common_users)))
    i = 0
    for user in common_users:
        history = json.dumps([brief(h) for h in train_user_dict[user]], indent=4, ensure_ascii=False)
        prom = formatting_prompt(history, candidate)

        llm_response = client.chat.completions.create(
            model="moonshot-v1-32k",  # moonshot-v1-8k / moonshot-v1-32k / moonshot-v1-128k
            messages=[
                {
                    "role": "system",
                    "content": instruction,
                },
                {"role": "user", "content": prom},
            ],
            temperature=0.1,
            stream=False,
        )
        content = llm_response.choices[0].message.content.strip()
        rec = {
            "user": user,
            "rec": content
        }
        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(rec, indent=4, ensure_ascii=False))
        generated_rec.append(rec)
        if i % 7 == 0:
            time.sleep(1)  # throttle so the API does not reject the calls as too frequent

    res = json.dumps(generated_rec, indent=4, ensure_ascii=False)

    # Mode 'w', not 'a': appending a second JSON array would corrupt the file.
    with open(store_path, 'w', encoding='utf-8') as file:
        file.write(res)


def openllm_cold_start_rec(model_path: str = '/Users/liuqiang/Desktop/code/llm/models/Qwen1.5-4B',
                           store_path: str = 'data/openllm_rec.json'):
    """
    Let a locally loaded causal LM recommend cold-start items to every user
    that appears in both the train and the test split.

    The prompt shows the user's training-split history and the full list of
    cold-start items; the generated continuation is stored per user as
    ``{"user", "rec"}``.

    :param model_path: directory or hub id of the model and tokenizer to load
    :param store_path: destination file; it is overwritten on every run so
        that it always holds exactly one valid JSON array
    """
    item_dict = get_metadata_dict()
    train_user_dict = get_user_history(data_type="train")
    test_user_dict = get_user_history(data_type="test")
    common_users = set(train_user_dict.keys()).intersection(set(test_user_dict.keys()))
    cold_start_items = get_cold_start_items()

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'

    def brief(item_id):
        # Copy without ``description`` (too long, costs too many tokens);
        # the shared metadata dict itself is left untouched.
        return {key: value for key, value in item_dict[item_id].items() if key != 'description'}

    # The candidate list is the same for every user, so build it once.
    candidate = json.dumps(
        [{**brief(item), 'item_id': item} for item in cold_start_items],
        indent=4, ensure_ascii=False)

    generated_rec = []
    print("total user number = " + str(len(common_users)))
    i = 0
    for user in common_users:
        history = json.dumps([brief(h) for h in train_user_dict[user]], indent=4, ensure_ascii=False)
        user_input = formatting_prompt(history, candidate)

        prompt = f"""### Instruction:
                    {instruction}

                    ### Input:
                    {user_input}

                    ### Response:
                    """

        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the inputs to wherever the model was placed instead of a
        # hard-coded 'mps', so the function also runs on CUDA or CPU hosts.
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=1500, pad_token_id=tokenizer.eos_token_id)
        # Keep only the newly generated tokens. Cutting the decoded text at
        # len(prompt) is unreliable because decoding does not have to
        # reproduce the prompt character for character.
        new_tokens = outputs[0][input_ids.shape[1]:]
        predict_output = tokenizer.decode(new_tokens, skip_special_tokens=True)

        rec = {
            "user": user,
            "rec": predict_output
        }
        i += 1
        print("-------------- " + str(i) + " -----------------")
        print(json.dumps(rec, indent=4, ensure_ascii=False))
        generated_rec.append(rec)

    res = json.dumps(generated_rec, indent=4, ensure_ascii=False)

    # Mode 'w', not 'a': appending a second JSON array would corrupt the file.
    with open(store_path, 'w', encoding='utf-8') as file:
        file.write(res)


# Script entry point. Runs three recommenders in turn: the Moonshot API model,
# the local model stored in ./models (results go to openllm_finetune_rec.json),
# and the local model at the default model_path.
if __name__ == "__main__":
    llm_api_cold_start_rec()
    openllm_cold_start_rec(model_path='./models', store_path='data/openllm_finetune_rec.json')
    openllm_cold_start_rec()

1.4 模型微调

代码cold_start/model_finetune.py

import os
from typing import List

import fire
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    set_peft_model_state_dict, PeftModel,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, DataCollatorForSeq2Seq, TrainingArguments

from utils.prompter import Prompter


def train(
        # model/data params
        base_model: str = "",  # the only required argument
        data_path: str = "./data/train.json",
        output_dir: str = "./models",
        # training hyperparams
        batch_size: int = 128,
        micro_batch_size: int = 4,
        num_epochs: int = 3,
        learning_rate: float = 3e-4,
        cutoff_len: int = 256,
        val_set_size: int = 2000,
        # lora hyperparams
        lora_r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.05,
        lora_target_modules: List[str] = [
            "q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"
        ],
        # llm hyperparams
        train_on_inputs: bool = True,  # if False, masks out inputs in loss
        add_eos_token: bool = False,
        group_by_length: bool = False,  # faster, but produces an odd training loss curve
        # wandb params
        wandb_project: str = "",
        wandb_run_name: str = "",
        wandb_watch: str = "",  # options: false | gradients | all
        wandb_log_model: str = "",  # options: false | true
        resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
        prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
):
    """
    Fine-tune ``base_model`` with LoRA adapters and save the merged model.

    Steps, in order: load the base model (float16) and its tokenizer, wrap the
    model with a LoRA config, load the dataset from ``data_path``, optionally
    restore adapter weights from ``resume_from_checkpoint``, tokenize every
    sample through the prompt template, train with ``transformers.Trainer``,
    save the adapter and tokenizer to ``output_dir``, then reload the base
    model, merge the adapter into it and save the merged model to the same
    ``output_dir``.

    :param base_model: model name or path to fine-tune (required)
    :param data_path: ``.json``/``.jsonl`` file, or a dataset name passed to ``load_dataset``
    :param output_dir: where checkpoints, adapter, tokenizer and merged model are written
    :param batch_size: effective batch size; divided by ``micro_batch_size`` to
        obtain the gradient-accumulation steps
    :param micro_batch_size: per-device batch size
    :param num_epochs: number of training epochs
    :param learning_rate: optimizer learning rate
    :param cutoff_len: maximum token length of a tokenized sample
    :param val_set_size: number of samples held out for evaluation; 0 disables evaluation
    :param lora_r: LoRA rank
    :param lora_alpha: LoRA alpha
    :param lora_dropout: LoRA dropout
    :param lora_target_modules: names of the modules that receive LoRA adapters
    :param train_on_inputs: when False, prompt tokens get label -100
    :param add_eos_token: used when tokenizing the prompt-only text for masking
    :param group_by_length: forwarded to ``TrainingArguments``
    :param wandb_project: sets ``WANDB_PROJECT`` and enables wandb reporting when non-empty
    :param wandb_run_name: run name used when wandb reporting is enabled
    :param wandb_watch: sets ``WANDB_WATCH`` when non-empty
    :param wandb_log_model: sets ``WANDB_LOG_MODEL`` when non-empty
    :param resume_from_checkpoint: directory holding a full checkpoint or a LoRA adapter
    :param prompt_template_name: template name handed to ``Prompter``
    """
    assert (
        base_model
    ), "Please specify a --base_model, e.g. --base_model='huggyllama/llama-7b'"
    gradient_accumulation_steps = batch_size // micro_batch_size

    prompter = Prompter(prompt_template_name)

    device_map = "auto"
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    ddp = world_size != 1
    if ddp:
        # Under DDP each process loads the whole model onto its own local rank
        # and takes a proportional share of the accumulation steps.
        device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
        gradient_accumulation_steps = gradient_accumulation_steps // world_size

    # Check if parameter passed or if set within environ
    use_wandb = len(wandb_project) > 0 or (
            "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0
    )
    # Only overwrite environ if wandb param passed
    if len(wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = wandb_project
    if len(wandb_watch) > 0:
        os.environ["WANDB_WATCH"] = wandb_watch
    if len(wandb_log_model) > 0:
        os.environ["WANDB_LOG_MODEL"] = wandb_log_model

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # NOTE(review): id 0 is assumed to be an unused/unk token for the chosen
    # tokenizer — confirm for the model actually being fine-tuned.
    tokenizer.pad_token_id = (
        0  # unk. we want this to be different from the eos token
    )
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        # Tokenize one prompt, append EOS when there is room, and use the
        # input ids as labels.
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
                result["input_ids"][-1] != tokenizer.eos_token_id
                and len(result["input_ids"]) < cutoff_len
                and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        # Render instruction/input/output through the template and tokenize it.
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            # Tokenize the prompt without the answer to learn how many leading
            # tokens must be masked out of the loss.
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point["input"]
            )
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=add_eos_token
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])

            if add_eos_token:
                user_prompt_len -= 1

            # -100 is the label value ignored by the loss.
            tokenized_full_prompt["labels"] = [
                                                  -100
                                              ] * user_prompt_len + tokenized_full_prompt["labels"][
                                                                    user_prompt_len:
                                                                    ]  # could be sped up, probably
        return tokenized_full_prompt

    # NOTE(review): the model above is loaded in float16 without 8-bit
    # quantization, yet the int8 preparation helper is applied — confirm this
    # is intended for the installed peft version.
    model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_target_modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(model, config)

    if data_path.endswith(".json") or data_path.endswith(".jsonl"):  # the data is in JSON or JSONL format
        data = load_dataset("json", data_files=data_path)
    else:
        data = load_dataset(data_path)

    if resume_from_checkpoint:
        # Check the available weights and load them
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "pytorch_model.bin"
        )  # Full checkpoint
        if not os.path.exists(checkpoint_name):
            checkpoint_name = os.path.join(
                resume_from_checkpoint, "adapter_model.bin"
            )  # only LoRA model - LoRA config above has to fit
            resume_from_checkpoint = (
                False  # So the trainer won't try loading its state
            )
        # The two files above have a different name depending on how they were saved, but are actually the same.
        if os.path.exists(checkpoint_name):
            print(f"Restarting from {checkpoint_name}")
            adapters_weights = torch.load(checkpoint_name)
            set_peft_model_state_dict(peft_model, adapters_weights)
        else:
            print(f"Checkpoint {checkpoint_name} not found")

    peft_model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

    # NOTE(review): val_set_size is passed as an absolute sample count
    # (default 2000) — presumably the dataset must be larger than that; confirm.
    if val_set_size > 0:
        train_val = data["train"].train_test_split(
            test_size=val_set_size, shuffle=True, seed=42
        )
        train_data = (
            train_val["train"].shuffle().map(generate_and_tokenize_prompt)
        )
        val_data = (
            train_val["test"].shuffle().map(generate_and_tokenize_prompt)
        )
    else:
        train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
        val_data = None

    if not ddp and torch.cuda.device_count() > 1:
        # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
        peft_model.is_parallelizable = True
        peft_model.model_parallel = True

    trainer = Trainer(
        model=peft_model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps" if val_set_size > 0 else "no",
            save_strategy="steps",
            eval_steps=200 if val_set_size > 0 else None,
            save_steps=200,
            output_dir=output_dir,
            save_total_limit=3,
            load_best_model_at_end=True if val_set_size > 0 else False,
            ddp_find_unused_parameters=False if ddp else None,
            group_by_length=group_by_length,
            report_to="wandb" if use_wandb else None,
            run_name=wandb_run_name if use_wandb else None,
        ),
        data_collator=DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    peft_model.config.use_cache = False

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    # Save the LoRA weights
    trainer.model.save_pretrained(output_dir)  # save the adapter weights
    tokenizer.save_pretrained(output_dir)  # save the tokenizer

    # Merge the LoRA weights into the original model and save the result
    model_to_merge = PeftModel.from_pretrained(
        AutoModelForCausalLM.from_pretrained(base_model), output_dir)

    merged_model = model_to_merge.merge_and_unload()
    merged_model.save_pretrained(output_dir)


# Script entry point: python-fire exposes the parameters of train() on the command line.
if __name__ == "__main__":
    fire.Fire(train)

# The fine-tuning code here is exactly the same as the one in personalized_rec

1.5 模型效果评估

将平均精准率、平均召回率作为比较指标，下面的代码用于评估并比较 1.3 节中基于上下文学习得到的各模型（API 大模型、开源模型、微调后的开源模型）的推荐效果。

import importlib
import json
import sys

# Make the ``utils`` directory importable and load its ``utils`` module.
# NOTE: 'utils' is relative, so it is resolved against the current working directory.
sys.path.append('utils')
utils = importlib.import_module('utils')

# Maximum number of recommendations kept per user, so that models returning
# lists of different lengths are compared fairly.
# NOTE(review): the name looks like a typo of REC_NUM — confirm before renaming.
REC_MUM = 8


def precision(rec_list: list, action_list: list) -> float:
    """
    Precision of one user's recommendations.

    :param rec_list: items recommended by the algorithm
    :param action_list: items the user actually purchased
    :return: share of distinct recommended items that were purchased;
        0.0 when nothing was recommended
    """
    recommended = set(rec_list)
    if not recommended:
        return 0.0
    hits = recommended & set(action_list)
    return len(hits) / len(recommended)


def recall(rec_list: list, action_list: list) -> float:
    """
    Recall of one user's recommendations.

    :param rec_list: items recommended by the algorithm
    :param action_list: items the user actually purchased
    :return: share of distinct purchased items that were recommended;
        0.0 when the user purchased nothing
    """
    purchased = set(action_list)
    if not purchased:
        return 0.0
    hits = set(rec_list) & purchased
    return len(hits) / len(purchased)


def find_all_occurrences(s, char):
    """Return the start index of every (possibly overlapping) occurrence of ``char`` in ``s``.

    :param s: string to search in
    :param char: substring to look for
    :return: list of start indices in ascending order
    """
    positions = []
    cursor = 0
    while True:
        hit = s.find(char, cursor)
        if hit == -1:
            return positions
        positions.append(hit)
        cursor = hit + 1  # step by one so overlapping matches are found too


def evaluate(data_path: str, model_type: str) -> tuple[float, float]:
    """
    Compute average precision and recall of stored recommendations against
    the users' test-split actions.

    :param data_path: JSON file holding a list of ``{"user", "rec"}`` records
    :param model_type: ``'llm_api'`` (``rec`` is a list literal of item ids) or
        ``'openllm'`` / ``'openllm_finetune'`` (item ids are extracted from
        ``"item_id": "..."`` fragments in free text)
    :return: ``(average precision, average recall)`` over the users that occur
        in both the recommendation file and the test split; ``(0.0, 0.0)``
        when there is no such user
    :raises ValueError: if ``model_type`` is not one of the supported values
    """
    import ast

    if model_type not in ('llm_api', 'openllm', 'openllm_finetune'):
        raise ValueError(f"unsupported model_type: {model_type!r}")

    test_user_dict = utils.get_user_history(data_type="test")
    with open(data_path, 'r') as file:
        rec = json.load(file)

    marker = '"item_id": "'
    common_num = 0  # users present in both the recommendations and the test split
    acc_p = 0.0  # accumulated precision
    acc_r = 0.0  # accumulated recall
    for x in rec:
        user = x['user']
        if user not in test_user_dict:
            continue
        common_num += 1
        action_list = test_user_dict[user]
        temp = x['rec']
        if model_type == 'llm_api':
            # SECURITY: ``temp`` is raw model output. It used to go through
            # eval(), which executes arbitrary code; literal_eval accepts only
            # literals. Unparseable answers count as "no recommendation".
            try:
                parsed = ast.literal_eval(temp)
            except (ValueError, SyntaxError, TypeError):
                parsed = []
            rec_list = list(parsed) if isinstance(parsed, (list, tuple)) else []
        else:
            # The local models answer with free text; pick out the 10-character
            # item id that follows every '"item_id": "' marker.
            rec_list = [
                temp[loc + len(marker): loc + len(marker) + 10]
                for loc in find_all_occurrences(temp, marker)
            ]
        # Cap the list so that models returning different numbers of items
        # are compared fairly.
        rec_list = rec_list[:REC_MUM]
        acc_p += precision(rec_list, action_list)
        acc_r += recall(rec_list, action_list)

    if common_num == 0:  # avoid dividing by zero when no user overlaps
        return 0.0, 0.0

    return acc_p / common_num, acc_r / common_num


if __name__ == "__main__":
    # (recommendation file, model_type) for every model under comparison
    runs = [
        ('data/llm_api_rec.json', 'llm_api'),
        ('data/openllm_rec.json', 'openllm'),
        ('data/openllm_finetune_rec.json', 'openllm_finetune'),
    ]

    res = []
    for rec_path, kind in runs:
        avg_p, avg_r = evaluate(rec_path, model_type=kind)
        res.append({
            f"{kind}_avg_p": avg_p,
            f"{kind}_avg_r": avg_r
        })

    print(json.dumps(res, indent=4, ensure_ascii=False))

最终评估结果显示：Qwen1.5-4B 微调后的模型效果反而不如未微调的模型。
可能原因：

微调样本数量太少，没有学习到重要的规律

提供的数据不够好，缺失某种特征，引入了噪声

模型参数比较少，根本难以解决这个问题

2、生成用户兴趣画像

3、生成个性化商品描述

4、猜你喜欢推荐

5、电商关联推荐

also_buy买了该商品的用户还买了
also_view浏览了该商品的用户还浏览了
前者更强烈，相似度设置为2，后者设置为 1
负样本的相似度设置为 0
正负样本数量应该差不多

5.1 数据准备

构建训练、测试样本similar_rec/data-process/generate_funetune_data.py

import json
import random

"""
按照如下格式构建训练、测试数据集：

{

"instruction": "You are a product expert who judges whether 
two products are similar based on your professional knowledge.", 

"input": "I will provide you with two product related introduction information, as follows(in JSON format):

[
    {
        "title": "SF221-Shaving Factory Straight Razor (Black), Shaving Factory Hand Made Shaving Brush, 100...",
        "brand": "Shaving Factory",
        "price": "$21.95",
        "description": ["Start Up combines citrus essential oils with gentle Alpha Hydroxy Acids to cleanse and refresh
            your face. The 5% AHA level is gentle enough for all skin types.", "", ""],
    },
    {
        "title": "Loud 'N Clear&trade; Personal Sound Amplifier",
        "brand": "idea village",
        "price": "",
        "description": ["Loud 'N Clear Personal Sound Amplifier allows you to turn up the volume on what people around 
            you are saying, listen at the level you want without disturbing others, hear a pin drop from across the room."],
    }
]

Based on above information, please predict if these two products are similar. The similarity  is between 0 and 2, 
0 being lowest and 2 being highest. You just need to ranking the above product, do not explain the reason.

"output": "0"

}

"""

"""
构建训练集、测试集的思路：
基于metadata数据集中also_buy、also_view 字段，某个商品与also_buy、also_view中的商品认为是相似的，这些可以做为正样本。
但他们的相似度应该不一样，also_buy是更强烈的偏好，我们设置相似度为2，also_view设置为1
为了让训练样本更加平衡，可以随机选择两个商品对做为负样本，选择负样本的数量跟正样本差不多。负样本设置相似度为0。
下面就基于这个思路来进行处理。

"""

# System-style instruction shared by every generated sample.
instruction = ("You are a product expert who judges whether "
               "two products are similar based on your professional knowledge.")

# Upper bound on re-draws when a random "negative" pair turns out to be a known related pair.
_MAX_NEGATIVE_DRAWS = 10


def _product_features(info: dict) -> dict:
    """Keep only the item fields that are shown to the model (key order matters for the prompt)."""
    return {
        "title": info['title'],
        "brand": info['brand'],
        "price": info['price'],
        "description": info['description']
    }


def _build_sample(first: dict, second: dict, output: str) -> dict:
    """Assemble one instruction-tuning sample for a pair of products with the given label."""
    formatted_input = json.dumps([first, second], indent=4, ensure_ascii=False)
    input_text = ("I will provide you with two product related introduction information, as follows(in JSON " +
                  "format):\n\n" +
                  formatted_input + "\n\n" +
                  "Based on above information, please predict if these two products are similar. The similarity " +
                  "is between 0 and 2, 0 being lowest and 2 being highest. You just need to ranking the above " +
                  "product, do not explain the reason.")
    return {
        "instruction": instruction,
        "input": input_text,
        "output": output
    }


def generate_data(out_path: str, item_dict: dict, test_ratio: float = 0.3):
    """
    Build the fine-tuning samples for product similarity and write them to disk.

    Positive samples pair each item with the items listed in its ``also_buy`` (label "2")
    and ``also_view`` (label "1") fields, as long as the other item exists in ``item_dict``.
    An item that appears in both lists is labelled "2". Negative samples (label "0") are
    random pairs of distinct items; roughly as many negatives as positives are drawn.
    The shuffled samples are split and written to ``<out_path>/test.json`` and
    ``<out_path>/train.json``, overwriting any previous content.

    :param out_path: directory in which test.json and train.json are written
    :param item_dict: item id -> metadata dict with title/brand/price/description/also_buy/also_view
    :param test_ratio: fraction of the samples that goes into test.json
    """
    data_list = []
    related_pairs = set()  # unordered id pairs that already have a positive label

    # Positive samples
    for item, info in item_dict.items():
        anchor = _product_features(info)
        also_buy = set(info['also_buy'])
        for other in also_buy.union(info['also_view']):
            if other not in item_dict:
                continue
            # also_buy is the stronger signal, so it wins when an item is in both lists.
            output = "2" if other in also_buy else "1"
            data_list.append(_build_sample(anchor, _product_features(item_dict[other]), output))
            related_pairs.add(frozenset((item, other)))

    # Negative samples
    positive_sample_num = len(data_list)
    # random.sample needs a sequence; passing a dict view raises TypeError on Python 3.11+.
    item_ids = list(item_dict)
    if len(item_ids) >= 2:
        for _ in range(positive_sample_num):
            for _attempt in range(_MAX_NEGATIVE_DRAWS):
                first_id, second_id = random.sample(item_ids, 2)
                if frozenset((first_id, second_id)) not in related_pairs:
                    break
            else:
                # Every draw hit a known related pair; labelling it "0" would contradict
                # the positive sample, so this negative is dropped.
                continue
            data_list.append(_build_sample(_product_features(item_dict[first_id]),
                                           _product_features(item_dict[second_id]),
                                           "0"))

    # Split into test and train sets
    random.shuffle(data_list)
    split_loc = int(len(data_list) * test_ratio)
    splits = {
        "test": data_list[0: split_loc],
        "train": data_list[split_loc:]
    }
    for split_name, samples in splits.items():
        # 'w' instead of 'a': appending a second JSON array on a re-run makes the file invalid JSON.
        with open(out_path + "/" + split_name + ".json", 'w', encoding='utf-8') as file_:
            json.dump(samples, file_, indent=4, ensure_ascii=False)


# Script entry: load the item metadata, then write train.json / test.json under ../data.
from generate_item_dict import get_metadata_dict

item_dict = get_metadata_dict()
generate_data(out_path="../data", item_dict=item_dict, test_ratio=0.3)

"""
    目前train.json 4616个样本。
    目前test.json 3706个样本。
"""

下面为每个商品生成相关的信息字典。
代码similar_rec/data-process/generate_item_dict.py

import csv
import json


def get_metadata_dict(path: str = '../../data/amazon_review/beauty/meta_All_Beauty.json') -> dict:
    """
    Build a lookup table of product metadata keyed by item id (``asin``).

    The file is read as JSON Lines: one JSON object per line, for example

        {"title": "Kiss My Face Exfoliating Face Wash Start Up, 4 Fluid Ounce",
         "brand": "Kiss My Face",
         "description": ["Start Up combines citrus essential oils with gentle Alpha Hydroxy Acids ...", "", ""],
         "also_buy": ["B000Z96JDI", "B00006IGL8", ...],
         "also_view": ["B000Z96JDI", "B001FB5HZG", ...],
         "price": "", "asin": "B00006IGL2", ...}

    Blank lines are skipped. Optional fields that are missing from a record default to an
    empty string (title, brand, price) or an empty list (description, also_buy, also_view).

    :param path: path of the metadata file
    :return: dict mapping asin -> {"title", "brand", "description", "also_buy", "also_view", "price"}
    :raises KeyError: if a record has no "asin"
    """
    item_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        # Parse line by line with json directly; routing the lines through csv.reader breaks
        # on blank lines and on lines longer than the csv field size limit.
        for line in file:
            line = line.strip()
            if not line:
                continue
            j = json.loads(line)
            price = j.get('price', "")
            # Some records carry long non-price text in the price field; treat it as missing.
            if price != "" and '$' not in price and len(price) > 10:
                price = ""
            item_dict[j['asin']] = {
                "title": j.get('title', ""),
                "brand": j.get('brand', ""),
                "description": j.get('description', []),
                "also_buy": j.get('also_buy', []),
                "also_view": j.get('also_view', []),
                "price": price
            }
    return item_dict

5.2 多路召回

下面是基于：

标签 brand
同时购买also_buy
同时浏览also_view
嵌入相似
这四种信息的四个召回算法的实现。

代码similar_rec/recall_items.py

import importlib
import operator
import sys

from sentence_transformers import SentenceTransformer

sys.path.append('./data-process')
generate_item_dict = importlib.import_module('generate_item_dict')


def tags_recall(item_id: str, item_dict: dict) -> [str]:
    """
    Tag-based recall that uses the brand as the tag: every other item whose brand
    equals the brand of ``item_id`` is recalled.
    :param item_id: id of the query item
    :param item_dict: item id -> metadata dict
    :return: ids of the recalled items, in ``item_dict`` iteration order
    """
    target_brand = item_dict[item_id]['brand']
    return [
        candidate_id
        for candidate_id, meta in item_dict.items()
        if meta['brand'] == target_brand and candidate_id != item_id
    ]


def embedding_recall(item_id: str,
                     item_dict: dict,
                     recall_num: int = 20,
                     min_similar_score: float = 0.8,
                     model_path: str = '/Users/liuqiang/Desktop/code/llm/models/bge-large-en-v1.5') -> [str]:
    """
    Recall items whose text embedding is close to the embedding of ``item_id``.

    Each item is represented by its title plus the first entry of its description.
    Candidates are scanned in ``item_dict`` order and the scan stops as soon as
    ``recall_num`` candidates exceed ``min_similar_score``; those candidates are then
    returned sorted by similarity, highest first. Candidates without a description are skipped.

    This is a demonstration implementation and is not efficient: every candidate is encoded
    on the fly. A better approach is to pre-compute all item embeddings and store them in
    faiss (or another vector store) so that recall takes milliseconds.

    :param item_id: id of the query item
    :param item_dict: item id -> metadata dict
    :param recall_num: maximum number of items to recall, 20 by default
    :param min_similar_score: similarity a candidate must exceed to be recalled
    :param model_path: path or name of the SentenceTransformer embedding model
    :return: ids of the recalled items
    """

    def _item_text(info: dict) -> str:
        # An item may have an empty description list; fall back to an empty description
        # instead of failing on [0].
        descriptions = info['description']
        first_desc = descriptions[0] if descriptions else ""
        return "title: " + info['title'] + "\n" + "description: " + first_desc

    query_text = _item_text(item_dict[item_id])
    if recall_num <= 0:
        # Nothing can be recalled, so skip loading the model.
        return []

    model = SentenceTransformer(model_path)
    query_embedding = model.encode([query_text], normalize_embeddings=True)
    similar_list = []
    for key, value in item_dict.items():
        if len(similar_list) >= recall_num:
            break  # quota filled; no need to walk the rest of the catalogue
        if key == item_id or not value['description']:
            continue
        candidate_embedding = model.encode([_item_text(value)], normalize_embeddings=True)
        # Embeddings are normalized, so the dot product is the similarity score.
        score = (query_embedding @ candidate_embedding.T)[0][0]
        if score > min_similar_score:
            similar_list.append((key, score))
    similar_list.sort(key=operator.itemgetter(1), reverse=True)
    return [key for key, _ in similar_list[0: recall_num]]


def also_buy_recall(item_id: str, item_dict: dict) -> [str]:
    """
    Recall based on the ``also_buy`` field of the Amazon product metadata, i.e. the
    items bought together with ``item_id``.
    :param item_id: id of the query item
    :param item_dict: item id -> metadata dict
    :return: the ``also_buy`` list stored for ``item_id``
    """
    return item_dict[item_id]['also_buy']


def also_view_recall(item_id: str, item_dict: dict) -> [str]:
    """
    Recall based on the ``also_view`` field of the Amazon product metadata, i.e. the
    items viewed together with ``item_id``.
    :param item_id: id of the query item
    :param item_dict: item id -> metadata dict
    :return: the ``also_view`` list stored for ``item_id``
    """
    return item_dict[item_id]['also_view']


if __name__ == "__main__":
    # Smoke run: embedding recall for one sample item (10 items, threshold 0.75).
    metadata = generate_item_dict.get_metadata_dict()
    separator = "----------"
    print(separator)
    print(embedding_recall("B00006IGL2", metadata, 10, 0.75))
    print(separator)

5.3 相似度排序

两种方法：

基于 rerank 模型的排序
基于微调后的大模型的排序

代码similar_rec/similar_ranking.py

import importlib
import json
import operator
import sys

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM

sys.path.append('./data-process')
generate_item_dict = importlib.import_module('generate_item_dict')


def cross_encoder_rerank(item_id: str,
                         recall_list: [[str]],
                         item_dict: dict,
                         top_n: int = 10,
                         model_path: str = '/Users/liuqiang/Desktop/code/llm/models/bge-reranker-large',
                         device: str = "mps") -> [dict]:
    """
    Rank the recalled items against ``item_id`` with a cross-encoder rerank model.

    The recall lists are merged and de-duplicated; the query item itself and ids that are
    not in ``item_dict`` are dropped. Each remaining candidate is scored against the query
    item using "title + first description entry" as the text of both sides.

    :param item_id: id of the item for which similar items are recommended
    :param recall_list: recalled item ids, one list per recall channel
    :param item_dict: item id -> metadata dict
    :param top_n: number of similar items to return, 10 by default
    :param model_path: path or name of the CrossEncoder rerank model
    :param device: torch device the rerank model runs on
    :return: up to ``top_n`` dicts {"item": id, "score": float}, best first
    """

    def _item_text(info: dict) -> str:
        # Items may have an empty description list; use an empty description then.
        descriptions = info['description']
        first_desc = descriptions[0] if descriptions else ""
        return "title: " + info['title'] + "\n" + "description: " + first_desc

    all_recall_items = set()
    for lst in recall_list:
        all_recall_items.update(lst)
    # An item must not be recommended as similar to itself.
    all_recall_items.discard(item_id)

    item_info = _item_text(item_dict[item_id])
    item_list = [item for item in all_recall_items if item in item_dict]
    if not item_list or top_n <= 0:
        # Nothing to rank: skip loading the model and scoring an empty batch.
        return []

    model = CrossEncoder(model_name=model_path, max_length=512, device=device)
    sentence_pairs = [[item_info, _item_text(item_dict[item])] for item in item_list]
    results = model.predict(sentences=sentence_pairs,
                            batch_size=32,
                            num_workers=0,
                            convert_to_tensor=True
                            )
    top_k = min(top_n, len(results))
    values, indices = results.topk(top_k)
    final_results = []
    for value, index in zip(values, indices):
        final_results.append({
            "item": item_list[int(index)],
            "score": value.item()
        })
    return final_results


def llm_rerank(item_id: str,
               recall_list: [[str]],
               item_dict: dict,
               top_n: int = 10,
               model_path: str = './models') -> [dict]:
    """
    Rank the recalled items against ``item_id`` with a (fine-tuned) causal language model.

    The recall lists are merged and de-duplicated; the query item itself and ids that are
    not in ``item_dict`` are dropped. For every remaining candidate the model is prompted
    with the same instruction and input wording that the fine-tuning samples use
    (similarity between 0 and 2) and the leading number of its answer is used as the score.
    Candidates whose answer does not start with a number are skipped.

    :param item_id: id of the item for which similar items are recommended
    :param recall_list: recalled item ids, one list per recall channel
    :param item_dict: item id -> metadata dict
    :param top_n: number of similar items to return, 10 by default
    :param model_path: directory of the model to load
    :return: up to ``top_n`` dicts {"item": id, "score": float}, best first
    """
    import re  # local import: only this function needs it

    def _features(info: dict) -> dict:
        return {
            "title": info['title'],
            "brand": info['brand'],
            "price": info['price'],
            "description": info['description']
        }

    all_recall_items = set()
    for lst in recall_list:
        all_recall_items.update(lst)
    # An item must not be recommended as similar to itself.
    all_recall_items.discard(item_id)

    a_dict = _features(item_dict[item_id])
    candidates = [item for item in all_recall_items if item in item_dict]
    if not candidates:
        # Nothing to rank: skip the expensive model load.
        return []

    print(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'

    instruction = ("You are a product expert who judges whether "
                   "two products are similar based on your professional knowledge.")
    # The prompt template keeps the 12-space indentation that its continuation lines have
    # always carried, so the text sent to the model is unchanged apart from the score range.
    pad = " " * 12
    leading_number = re.compile(r'\s*(\d+(?:\.\d+)?|\.\d+)')

    results = []
    for item in candidates:
        sample_pair = [a_dict, _features(item_dict[item])]
        formatted_input = json.dumps(sample_pair, indent=4, ensure_ascii=False)
        # Score range is 0-2 here to match the wording and labels of the fine-tuning data.
        input_text = ("I will provide you with two product related introduction information, as follows(in JSON " +
                      "format):\n\n" +
                      formatted_input + "\n\n" +
                      "Based on above information, please predict if these two products are similar. The similarity " +
                      "is between 0 and 2, 0 being lowest and 2 being highest. You just need to ranking the above " +
                      "product, do not explain the reason.")
        prompt = ("### Instruction:\n" +
                  pad + instruction + "\n" +
                  "\n" +
                  pad + "### Input:\n" +
                  pad + input_text + "\n" +
                  "\n" +
                  pad + "### Response:\n" +
                  pad)
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the inputs to wherever the model was placed instead of assuming Apple's "mps".
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)
        predict_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(prompt):]
        match = leading_number.match(predict_output)
        if match is None:
            # The model answered with something other than a number; a bare float() would
            # abort the whole ranking, so this candidate is skipped instead.
            print("skip item " + str(item) + ": no similarity score in model output")
            continue
        results.append({
            "item": item,
            "score": float(match.group(1))
        })

    sorted_list = sorted(results, key=operator.itemgetter('score'), reverse=True)
    return sorted_list[:top_n]


if __name__ == "__main__":
    # Sample recall results, one list per recall channel, used as ranking input for B00006IGL2.
    embedding_candidates = ['B000052YPP', 'B00005308B', 'B0000530HZ', 'B000052YD8', 'B00005308M', 'B000052YMO',
                            '9790787006', '6546546450', '9744914572', '7414204790']
    also_view_candidates = ['B000Z96JDI', 'B001FB5HZG', 'B00213WCNC', 'B00BBFOVO4', 'B0085EVLRO']
    also_buy_candidates = ['B000Z96JDI', 'B00006IGL8', 'B007C5X34G', 'B00006IGLF', 'B00213WCNC', 'B00D1W1QXE',
                           'B001FB5HZG', 'B000FQ86RI', 'B0012BSKBM', 'B0085EVLRO', 'B00A2EXVQE']
    brand_candidates = ['B00028EYZW', 'B001E0T0HE', 'B00FTBJ6HI', 'B00KLDU08S', 'B00OQQWU4I']
    recall_channels = [embedding_candidates, also_buy_candidates, also_view_candidates, brand_candidates]

    metadata = generate_item_dict.get_metadata_dict('../data/amazon_review/beauty/meta_All_Beauty.json')

    # To try the cross-encoder ranker instead, call
    # cross_encoder_rerank("B00006IGL2", recall_channels, metadata, 10) and print it the same way.

    separator = "----------"
    print(separator)
    ranked = llm_rerank("B00006IGL2", recall_channels, metadata, 10, './models')
    print(json.dumps(ranked, indent=4, ensure_ascii=False))
    print(separator)

5.4 效果评估

预测相似度（0-2），利用 RMSE 指标评估
代码similar_rec/evaluate.py

import json
import math

import fire
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


def is_valid_json(text):
    """Return True when ``text`` parses as JSON, False when the JSON decoder rejects it."""
    try:
        json.loads(text)
    except json.JSONDecodeError:
        return False
    else:
        return True


def is_float(s):
    """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


def rmse(_true, _predict):
    """
    Root of the (mean) squared error between gold and predicted values.

    For two numbers this is sqrt((true - predict)^2), i.e. the absolute error.
    For two equally long lists/tuples it is sqrt(mean((true_i - predict_i)^2)).

    :param _true: gold value, or list/tuple of gold values
    :param _predict: predicted value, or list/tuple of predicted values
    :return: the error as a float
    :raises ValueError: if sequences are given that are empty or differ in length
    """
    if isinstance(_true, (list, tuple)) or isinstance(_predict, (list, tuple)):
        if len(_true) != len(_predict):
            raise ValueError("_true and _predict must have the same length")
        if len(_true) == 0:
            raise ValueError("rmse of an empty sequence is undefined")
        squared_errors = [(t - p) ** 2 for t, p in zip(_true, _predict)]
        return math.sqrt(sum(squared_errors) / len(squared_errors))
    # The error has to be squared before the root is taken; taking the root of the
    # absolute error compresses large errors and inflates small ones.
    return math.sqrt((_true - _predict) ** 2)


def output_format(output) -> float:
    """
    Extract the predicted similarity score from raw model output.

    Models that were not fine-tuned do not always answer with a bare number. Observed shapes:
      1. " The similarity between these two products is 0.5."
      2. "The similarity between the two products is 0.5. The reason is that both products ..."
      3. "0.0000000000..." (a number with a very long run of digits)
      4. a JSON-like list of objects that each contain '"similarity": 0'

    Only the first 1000 characters are inspected. A number at the very start of the output
    wins; otherwise the number that follows the first known marker phrase is used (the
    marker phrases are tried in a fixed priority order, and only the first one found counts).

    :param output: raw text generated by the model
    :return: the parsed score, or -1 when no score could be parsed
    """
    import re  # local import: only this function needs it

    # A full non-negative decimal number ("2", "0.5", ".5"). Parsing the whole number matters:
    # looking at just two characters turns "0.5" into 0.0.
    number_pattern = re.compile(r'\s*(\d+(?:\.\d+)?|\.\d+)')

    def _number_at(text, start):
        match = number_pattern.match(text, start)
        return None if match is None else float(match.group(1))

    output = output[:1000]  # ignore anything generated beyond the first 1000 characters

    leading = _number_at(output, 0)
    if leading is not None:
        return leading

    markers = (
        'The similarity between the two products is ',
        'The similarity between these two products is ',
        '"similarity": ',
    )
    for marker in markers:
        index = output.find(marker)  # -1 when the marker is absent
        if index > -1:
            score = _number_at(output, index + len(marker))
            return -1.0 if score is None else score
    return -1.0


def load_model_token(model_path: str) -> (AutoModelForCausalLM, AutoTokenizer):
    """
    Load the causal language model and its tokenizer from ``model_path``.
    :param model_path: directory (or name) of the model
    :return: (model, tokenizer)
    """
    model_kwargs = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
    }
    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.padding_side = 'right'  # to prevent warnings
    return model, tokenizer


def evaluate(model_path: str,
             test_data_path: str = './data',
             keep_sample_num: int = 10) -> float:
    """
    Evaluate a model on the first ``keep_sample_num`` test samples and return its RMSE.

    For each sample the model is prompted with the sample's instruction and input, the
    similarity score is parsed from the generated text with ``output_format``, and the
    error against the gold score is recorded. Samples whose output cannot be parsed
    (``output_format`` returns -1) are left out of the statistic.

    :param model_path: directory (or name) of the model to evaluate
    :param test_data_path: dataset location passed to ``load_dataset``; its 'test' split is used
    :param keep_sample_num: maximum number of test samples to evaluate
    :return: sqrt(mean(squared error)) over the parsed samples, or NaN when none could be parsed
    """
    model, tokenizer = load_model_token(model_path)

    dataset_dict = load_dataset(test_data_path)
    test_dataset = dataset_dict['test'][0:keep_sample_num]
    # The slice may hold fewer rows than requested; iterate over what is actually there.
    sample_num = len(test_dataset['input'])

    squared_error_sum = 0.0  # accumulated squared error
    acc_num = 0  # number of samples that contribute to the statistic

    # The prompt template keeps the 8-space indentation its continuation lines (including
    # the blank ones) have always carried, so the text sent to the model is unchanged.
    pad = " " * 8

    for i in range(sample_num):
        prompt = ("### Instruction:\n" +
                  pad + str(test_dataset['instruction'][i]) + "\n" +
                  pad + "\n" +
                  pad + "### Input:\n" +
                  pad + str(test_dataset['input'][i]) + "\n" +
                  pad + "\n" +
                  pad + "### Response:\n" +
                  pad)
        gold_output = float(test_dataset['output'][i])
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
        # Send the inputs to wherever the model was placed instead of assuming Apple's "mps".
        outputs = model.generate(input_ids=input_ids.to(model.device),
                                 max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)
        predict_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(prompt):]
        print("--------sample: " + str(i) + "--------")
        print("预测的输出: " + predict_output + "\n")
        score = output_format(predict_output)
        print("预测的score: " + str(score) + "\n")
        if score == -1:  # the output was too free-form to parse a score from
            continue

        acc_num += 1
        predict_output = score
        # Per-sample error: root of the squared error, i.e. the absolute error.
        rmse_ = math.fabs(gold_output - predict_output)
        squared_error_sum += rmse_ ** 2
        dic = {  # per-sample evaluation result, printed for inspection
            "sample": i,
            "input": test_dataset['input'][i],
            "gold_output": gold_output,
            "predict_output": predict_output,
            "rmse": rmse_
        }
        print(json.dumps(dic, indent=4, ensure_ascii=False))
        print("----------------")

    if acc_num == 0:
        # No sample produced a parseable score; avoid dividing by zero.
        print("no sample produced a parseable score, rmse is undefined")
        return float('nan')
    # RMSE aggregates squared errors first and takes the root once at the end.
    return math.sqrt(squared_error_sum / acc_num)


def effect_comparison(base_model_path: str = '/Users/liuqiang/Desktop/code/llm/models/Qwen1.5-4B',
                      finetune_model_path: str = './models',
                      test_data_path: str = './data',
                      keep_sample_num: int = 10):
    """
    Evaluate the base model and the fine-tuned model on the same test samples and
    print the average rmse of each.
    :param base_model_path: location of the base model
    :param finetune_model_path: location of the fine-tuned model
    :param test_data_path: dataset location handed to ``evaluate``
    :param keep_sample_num: number of test samples handed to ``evaluate``
    """
    shared_args = (test_data_path, keep_sample_num)
    avg_base_rmse = evaluate(base_model_path, *shared_args)
    avg_finetune_rmse = evaluate(finetune_model_path, *shared_args)

    print(f"基底模型的平均rmse：{avg_base_rmse!s}")
    print(f"微调模型的平均rmse：{avg_finetune_rmse!s}")


if __name__ == "__main__":
    # Hand effect_comparison to python-fire so it can be invoked from the command line.
    fire.Fire(effect_comparison)