生成范式

大模型生成特征、训练数据与物品

1、大模型生成嵌入特征

1.1 利用 sentence-transformers 框架嵌入

见下方代码

"""
利用sentence_transformers框架来实现一个最简单的个性化推荐：
1. 用户嵌入：用户浏览过的新闻的嵌入的平均值
2. 预测：利用用户嵌入与新闻嵌入的cosine余弦
"""

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

col_spliter = "\t"
DIMS = 384  # embedding width of the all-MiniLM-L6-v2 model
TOP_N = 10  # number of news items recommended to each user

# The MIND .tsv files have no header row. Without header=None pandas would
# consume the first record as column names and that record would be lost.
df_news = pd.read_csv(
    "./data/mind/MINDsmall_train/news.tsv",
    sep=col_spliter,
    header=None,
    names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url',
           'title_entity', 'abstract_entity'],
)

df_behavior = pd.read_csv(
    "./data/mind/MINDsmall_train/behaviors.tsv",
    sep=col_spliter,
    header=None,
    names=['impression_id', 'user_id', 'time', 'click_history', 'news'],
)

model = SentenceTransformer('all-MiniLM-L6-v2')

# Map every news id to the embedding of its title.
# All titles are encoded in one batched call instead of one call per row;
# a missing title is encoded as the empty string rather than passed as NaN.
titles = df_news['title'].fillna('').astype(str).tolist()
title_vectors = model.encode(titles)
news_embeddings = dict(zip(df_news['news_id'], title_vectors))


def rec_4_one_user(click_history):
    """Return the TOP_N news items most similar to a user's click history.

    The user embedding is the mean of the embeddings of the clicked news.
    Every item in ``news_embeddings`` is then scored by its cosine similarity
    to that user embedding.

    Args:
        click_history: iterable of news ids the user has clicked.

    Returns:
        A list of at most TOP_N ``(news_id, cosine_similarity)`` tuples,
        highest similarity first. The list is empty when none of the clicked
        ids has a known embedding (including an empty history).
    """
    # Ids without an embedding are skipped instead of raising KeyError.
    clicked = [news_embeddings[n] for n in click_history if n in news_embeddings]
    if not clicked:
        # Nothing to average: the mean would be a division by zero.
        return []

    # Accumulate in float64, then cast back to float32 so the dtype matches
    # the stored news embeddings when both are handed to util.cos_sim.
    emb = (np.sum(clicked, axis=0, dtype=float) / len(clicked)).astype(np.float32)

    # Score all news in a single cos_sim call (1 x N result) rather than one
    # tensor operation per news item.
    news_ids = list(news_embeddings)
    matrix = np.stack([news_embeddings[news_id] for news_id in news_ids])
    scores = util.cos_sim(emb, matrix)[0].tolist()

    ranked = sorted(zip(news_ids, scores), key=lambda x: x[1], reverse=True)
    return ranked[:TOP_N]


"""
为所有用户生成推荐
"""
# Build the recommendation list of every user that appears in the behaviour log.
user_rec = {}
for _, row in df_behavior.iterrows():
    user_id = row['user_id']
    click_history = row['click_history']
    # A user without any click history is read by pandas as NaN (a float),
    # which has no .split(); such users get an empty recommendation list.
    if not isinstance(click_history, str) or not click_history.strip():
        user_rec[user_id] = []
        continue
    rec = rec_4_one_user(click_history.split())
    user_rec[user_id] = rec

最终的输出结果基本符合要求，但是可能会在格式上出一点问题。

1.2 其他嵌入方法

UNBERT
PREC

2、大模型生成文本特征

2.1 生成新闻标题

import os
import time
import pandas as pd
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from tqdm import tqdm

# Minimum number of seconds one iteration of the generation loop should take.
MIN_INTERVAL = 1.5

# Prompt field name -> column name in news.tsv.
keys = dict(
    title='title',
    abstract='abs',
    category='cat',
    subcategory='subcat',
)

current_path = os.getcwd()

# Load the news table into a DataFrame (the first row of the file is the header).
news_df = pd.read_csv(
    filepath_or_buffer=os.path.join(current_path, 'data', 'news.tsv'),
    sep='\t',
    header=0,
)

# Build the news list. Each element is a (news id, dict) tuple whose dict holds
# the prompt fields of that news item, for example:
# ('N55528', {'title': 'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By',
#             'abstract': "Shop the notebooks, jackets, and more that the royals can't live without.",
#             'category': 'lifestyle', 'subcategory': 'lifestyleroyals'})
news_list = []
for _, row in tqdm(news_df.iterrows(), total=len(news_df)):
    dic = {}
    for key, column in keys.items():
        value = row[column]
        # A missing value (e.g. no abstract) becomes an empty string so the
        # prompt does not contain the literal text "nan".
        dic[key] = '' if pd.isna(value) else value
    news_list.append((row['nid'], dic))

# Prompt template for title enhancement. The {title}, {abstract}, {category}
# and {subcategory} placeholders are filled in for each news item.
prompt_template = """You are asked to act as a news title enhancer. I will provide you a piece of news, with its original title, category, subcategory, and abstract (if exists). The news format is as below:

[title] {title}
[abstract] {abstract}
[category] {category}
[subcategory] {subcategory}

where title, abstract, category, and subcategory in the brace will be filled with content. You can only response a rephrased news title which should be clear, complete, objective and neutral. You can expand the title according to the above requirements. You are not allowed to response any other words for any explanation. Your response format should be:

[newtitle] 

where [newtitle] should be filled with the enhanced title. Now, your role of news title enhancer formally begins. Any other information should not disturb your role."""

# File the generated titles are appended to.
save_path = current_path + '/output/news_summarizer.log'

# Set up the local llama.cpp model through LangChain's LlamaCpp wrapper.
# The GGUF file location can be overridden with the LLAMA_MODEL_PATH
# environment variable so the script is not tied to one machine; when the
# variable is unset the original path is used.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(
    model_path=os.environ.get(
        'LLAMA_MODEL_PATH',
        "/Users/liuqiang/Desktop/code/llm/models/gguf/qwen1.5-72b-chat-q5_k_m.gguf",
    ),
    temperature=0.8,
    top_p=0.8,
    n_ctx=6000,
    callback_manager=callback_manager,
    verbose=True,
    # stop=["<|im_end|>"]  # stop generating as soon as one of these tokens appears
)

"""
PromptTemplate 与 chain的使用案例：
（1）文本的使用方式
prompt = PromptTemplate(
    input_variables=["product"],
    template="What is a good name for a company that makes {product}?",
)

chain = LLMChain(llm=llm, prompt=prompt)
# Run the chain only specifying the input variable.
print(chain.run("colorful socks"))

（2）字典的使用方式
prompt = PromptTemplate(
    input_variables=["company", "product"],
    template="What is a good name for {company} that makes {product}?",
)
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run({
    'company': "ABC Startup",
    'product': "colorful socks"
    }))
"""

# Bind the title-enhancement template to the model: the chain takes the four
# news fields as inputs and returns the model's completion.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["title", "abstract", "category", "subcategory"],
)
chain = LLMChain(prompt=prompt, llm=llm)

# Collect the news ids already present in the log so a restarted run does not
# generate them again.
exist_set = set()
# On the very first run the log does not exist yet, which simply means that
# nothing has been generated so far.
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        for line in f:
            # A record line is "<nid>\t<title>". Requiring the tab keeps stray
            # continuation text that merely starts with 'N' out of the id set.
            nid_part, tab, _ = line.partition('\t')
            if tab and nid_part.startswith('N'):
                exist_set.add(nid_part)

# Make sure the output directory exists up front; otherwise every append below
# would fail only after the (expensive) model call has already been made.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Ask the model for an enhanced title for every news item not yet in the log.
for nid, content in tqdm(news_list):
    start_time = time.time()
    if nid in exist_set:
        continue
    try:
        enhanced = chain.run(
            title=content['title'],
            abstract=content['abstract'],
            category=content['category'],
            subcategory=content['subcategory'],
        )
        # Collapse line breaks and tabs in the model output so that each
        # record stays on one line with a single tab after the news id.
        enhanced = ' '.join(enhanced.split())
        with open(save_path, 'a') as f:
            f.write(f'{nid}\t{enhanced}\n')
    except Exception as e:
        # Best effort: report which item failed and continue with the next one.
        print(f'{nid}: {e}')

    # Throttle so that one iteration takes at least MIN_INTERVAL seconds.
    interval = time.time() - start_time
    if interval <= MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - interval)

2.2 生成用户兴趣画像

文件prompter.py

import json
import os

import pandas as pd
from UniTok import UniDep
from tqdm import tqdm


class MindPrompter:
    """Load the news table and expose it as prompt strings or id -> title dicts.

    Each accessor walks ``news_df`` once and caches its result on the
    instance, so repeated calls return the same object without recomputation.
    """

    def __init__(self, data_path):
        """Read the tab-separated news file at ``data_path`` (first row is the header)."""
        self.data_path = data_path

        self.news_df = pd.read_csv(
            filepath_or_buffer=data_path,
            sep='\t',
            header=0,
        )

        # Prompt field name -> column name in the news table.
        self.keys = dict(
            title='title',
            abstract='abs',
            category='cat',
            subcategory='subcat',
        )

        self._news_list = None
        self._news_dict = None
        # Separate cache for the "(category) title" variant. Sharing one cache
        # with get_news_dict() made the second accessor called return the
        # first one's result.
        self._news_dict_with_category = None

    def stringify(self):
        """Return a list of ``(nid, text)`` tuples.

        ``text`` contains one ``[field] value`` line for every entry of
        ``self.keys``, in that order.
        """
        if self._news_list is None:
            news_list = []
            for _, row in tqdm(self.news_df.iterrows(), total=len(self.news_df)):
                text = ''.join(
                    f'[{key}] {row[column]}\n' for key, column in self.keys.items()
                )
                news_list.append((row['nid'], text))
            self._news_list = news_list
        return self._news_list

    def get_news_dict(self):
        """Return a dict mapping each news id to its title."""
        if self._news_dict is None:
            self._news_dict = {
                row['nid']: row['title']
                for _, row in tqdm(self.news_df.iterrows(), total=len(self.news_df))
            }
        return self._news_dict

    def get_news_dict_with_category(self):
        """Return a dict mapping each news id to ``"(category) title"``."""
        if self._news_dict_with_category is None:
            self._news_dict_with_category = {
                row['nid']: f'({row["cat"]}) {row["title"]}'
                for _, row in tqdm(self.news_df.iterrows(), total=len(self.news_df))
            }
        return self._news_dict_with_category


class MindUser:
    """Render each user's browsing history as a numbered list of news titles."""

    def __init__(self, data_path, mind_prompter):
        """Open the user depot at ``data_path`` and fetch the id -> title dict.

        Args:
            data_path: location handed to ``UniDep``.
            mind_prompter: object providing ``get_news_dict()``.
        """
        self.depot = UniDep(data_path, silent=True)
        self.nid = self.depot.vocabs('nid')
        self.news_dict = mind_prompter.get_news_dict()

        self._user_list = None

    def stringify(self):
        """Return a cached list with exactly one ``(uid, text)`` tuple per user.

        ``text`` has one ``(k) title`` line per history entry, numbered from 1.
        For a user whose history is empty, ``text`` is ``None``.
        """
        if self._user_list is not None:
            return self._user_list

        user_list = []
        for user in tqdm(self.depot):
            history = user['history']
            if not history:
                user_list.append((user['uid'], None))
                # Without this, the user was appended a second time below
                # with an empty string.
                continue
            # History entries are indices; self.nid.i2o maps them to news ids.
            text = ''.join(
                f'({pos}) {self.news_dict[self.nid.i2o[n]]}\n'
                for pos, n in enumerate(history, start=1)
            )
            user_list.append((user['uid'], text))
        self._user_list = user_list
        return self._user_list


class MindColdUser:
    """Render the history of users who have between one and five browsed news."""

    def __init__(self, data_path, mind_prompter):
        """Open the user depot and fetch the id -> "(category) title" dict."""
        self._user_list = None
        self.depot = UniDep(data_path, silent=True)
        self.nid = self.depot.vocabs('nid')
        self.news_dict = mind_prompter.get_news_dict_with_category()

    def stringify(self):
        """Return a cached list of ``(uid, text)`` tuples.

        Users with an empty history or more than five history entries are
        left out. ``text`` has one ``(k) (category) title`` line per entry,
        numbered from 1.
        """
        if self._user_list is None:
            collected = self._user_list = []
            for user in tqdm(self.depot):
                # Keep only non-empty histories of at most five entries.
                if user['history'] and len(user['history']) <= 5:
                    numbered = [
                        f'({pos + 1}) {self.news_dict[self.nid.i2o[n]]}\n'
                        for pos, n in enumerate(user['history'])
                    ]
                    collected.append((user['uid'], ''.join(numbered)))
        return self._user_list


class MindCoT:
    """Render, for allowed users, their interest topics followed by their history."""

    def __init__(self, data_path, plugin_path, mind_prompter, allowed_user_path):
        """Open the user and plugin depots and load the allowed-user file.

        Args:
            data_path: location of the user depot handed to ``UniDep``.
            plugin_path: location of the depot holding per-user 'topic' and
                'region' data.
            mind_prompter: object providing ``get_news_dict_with_category()``.
            allowed_user_path: JSON file with the users to include.
        """
        self.depot = UniDep(data_path, silent=True)
        self.plugin = UniDep(plugin_path, silent=True)
        # NOTE(review): vocabs is indexed here but called as vocabs('nid')
        # below -- confirm both forms are valid for UniDep.
        self.tv = self.plugin.vocabs['topic']
        self.rv = self.plugin.vocabs['region']

        self.nid = self.depot.vocabs('nid')
        self.news_dict = mind_prompter.get_news_dict_with_category()

        self._user_list = None
        # Use a context manager so the file handle is closed after loading.
        with open(allowed_user_path) as f:
            self.allowed_user = json.load(f)

    def stringify(self):
        """Return a cached list of ``(uid, text)`` tuples for allowed users.

        ``text`` is an "Interest Topics:" bullet list, a blank line, and a
        "History:" section with one ``(k) (category) title`` line per entry.
        """
        if self._user_list is not None:
            return self._user_list

        user_list = []
        for user in tqdm(self.depot):
            if user['uid'] not in self.allowed_user:
                continue
            pg = self.plugin[user['uid']]

            parts = ['Interest Topics:\n']
            parts.extend(f'- {self.tv[t]}\n' for t in pg['topic'])
            parts.append('\n')
            # string += 'Interest Regions:\n'
            # for r in pg['region']:
            #     string += f'- {self.rv[r]}\n'
            # string += '\n'

            parts.append('History:\n')
            parts.extend(
                f'({pos}) {self.news_dict[self.nid.i2o[n]]}\n'
                for pos, n in enumerate(user['history'], start=1)
            )
            user_list.append((user['uid'], ''.join(parts)))
        self._user_list = user_list
        return self._user_list

主文件user_portrait.py

import json
import os
import time
import pandas as pd
from UniTok import UniDep
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm
from prompter import MindPrompter, MindUser

# Minimum number of seconds one iteration of the generation loop should take.
MIN_INTERVAL = 0

current_path = os.getcwd()

# (uid, history text) pairs, one per user, as rendered by MindUser.
mind_prompter = MindPrompter(f'{current_path}/data/news.tsv')
user_list = MindUser(f'{current_path}/data/user', mind_prompter).stringify()

# Load the news table into a DataFrame (the first row of the file is the header).
news_df = pd.read_csv(f'{current_path}/data/news.tsv', sep='\t', header=0)

# News lookup: news id -> title.
news_dict = {row['nid']: row['title'] for _, row in tqdm(news_df.iterrows())}

depot = UniDep(f'{current_path}/data/user', silent=True)
nid = depot.vocabs('nid')

# Earlier variant that built, for each user, the list of titles they browsed.
# One resulting element looked like this:
# (0, ["'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction",
# "Three takeaways from Yankees' ALCS Game 5 victory over the Astros",
# "Rosie O'Donnell: Barbara Walters Isn't 'Up to Speaking to People' Right Now",
# "Four flight attendants were arrested in Miami's airport after bringing in thousands in cash, police say",
# 'Michigan sends breakup tweet to Notre Dame as series goes on hold',
# "This Wedding Photo of a Canine Best Man Captures Just How Deep a Dog's Love Truly Is",
# "Robert Evans, 'Chinatown' Producer and Paramount Chief, Dies at 89",
# 'Former US Senator Kay Hagan dead at 66',
# 'Joe Biden reportedly denied Communion at a South Carolina church because of his stance on abortion'])
# user_list = []
# for user in tqdm(depot):
#     list = []
#     if not user['history']:
#         user_list.append((user['uid'], None))
#     for i, n in enumerate(user['history']):
#         list.append(news_dict[nid.i2o[n]])
#     user_list.append((user['uid'], list))

# Prompt template for user-interest profiling. The {input} placeholder is
# filled with the user's browsed-news text; the model is asked to answer with
# [topics] chosen from the 19 listed options and an optional [region].
system = """You are asked to describe user interest based on his/her browsed news title list, the format of which is as below:

{input}

You can only response the user interests with the following format to describe the [topics] and [regions] of the user's interest

[topics]
- topic1
- topic2
...
[region] (optional)
- region1
- region2
...

where topic is limited to the following options: 

(1) health
(2) education
(3) travel
(4) religion
(5) culture
(6) food
(7) fashion
(8) technology
(9) social media
(10) gender and sexuality
(11) race and ethnicity
(12) history
(13) economy
(14) finance
(15) real estate
(16) transportation
(17) weather
(18) disasters
(19) international news

and the region should be limited to each state of the US.

Only [topics] and [region] can be appeared in your response. If you think region are hard to predict, leave it blank. Your response topic/region list should be ordered, that the first several options should be most related to the user's interest. You are not allowed to response any other words for any explanation or note. Now, the task formally begins. Any other information should not disturb you."""

# File the generated user-interest profiles are appended to.
save_path = current_path + '/output/user_profiler.log'

# Set up the local llama.cpp model through LangChain's LlamaCpp wrapper.
# The GGUF file location can be overridden with the LLAMA_MODEL_PATH
# environment variable so the script is not tied to one machine; when the
# variable is unset the original path is used.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(
    model_path=os.environ.get(
        'LLAMA_MODEL_PATH',
        "/Users/liuqiang/Desktop/code/llm/models/gguf/qwen1.5-72b-chat-q5_k_m.gguf",
    ),
    temperature=0.8,
    top_p=0.8,
    n_ctx=6000,
    callback_manager=callback_manager,
    verbose=True,
)

# Collect the uids already present in the log so a restarted run does not
# generate them again.
exist_set = set()
# On the very first run the log does not exist yet, which simply means that
# nothing has been generated so far.
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # e.g. a record truncated by an interrupted run; skipping it
                # means that user is simply generated again.
                continue
            exist_set.add(data['uid'])

# Number of users skipped because they have no history text.
empty_count = 0

# The template and the chain do not depend on the user, so build them once
# instead of on every iteration.
prompt = PromptTemplate(
    input_variables=["input"],
    template=system,
)
chain = LLMChain(llm=llm, prompt=prompt)

# Make sure the output directory exists up front; otherwise every append below
# would fail only after the (expensive) model call has already been made.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Ask the model for an interest profile for every user not yet in the log.
for uid, content in tqdm(user_list):
    start_time = time.time()
    if uid in exist_set:
        continue

    if not content:
        empty_count += 1
        continue

    try:
        enhanced = chain.run(input=content)
        enhanced = enhanced.rstrip('\n')
        with open(save_path, 'a') as f:
            f.write(json.dumps({'uid': uid, 'interest': enhanced}) + '\n')
    except Exception as e:
        # Best effort: report which user failed and continue with the next one.
        print(f'{uid}: {e}')

    # Throttle so that one iteration takes at least MIN_INTERVAL seconds.
    interval = time.time() - start_time
    if interval <= MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - interval)

print('empty count: ', empty_count)

输出结果基本满足要求，但个别 topic 可能不在提示词限定的 19 个选项之内（大模型产生的幻觉）。
可以通过优化提示词、对输出做后处理过滤等方式缓解。

2.3 生成文本特征的其他方法

RLMRec 生成物品与用户画像
PALR 框架生成用户兴趣画像

3、大模型生成训练数据

3.1 直接生成表格类数据

方法：GReaT

3.2 生成监督样本数据

方法：基于开源的数据集，通过 GPT-4 获得对应的答案，即获得<输入,输出> 样本对，就是可用于微调的监督样本。
（实际使用时，还需借助人工抽查、规则策略、置信度评分等手段提升质量）

4、大模型生成待推荐物品

4.1 生成个性化新闻

（1）生成用户喜欢的新闻类型、标题
（2）基于（1）的结果，生成满足用户个性化需求的新闻

import json
import os
import time
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from tqdm import tqdm

# Minimum number of seconds one iteration of the generation loop should take.
MIN_INTERVAL = 0
current_path = os.getcwd()

# File the generated personalized news are appended to.
save_path = current_path + '/output/personalized_news.log'

# Prompt template; the {news} placeholder is filled with the news a user browsed.
prompt_template = """
you are a news writing expert, The information of the news a user browsed are as follows:

"news": {news}

The news in the curly braces above are a list of all the news that the user has browsed, which may contain multiple news articles.
the information in the news stand for the category of news the user likes.
Now please write a new for the user, the news must relevant to the interest of the user, the news you write must less than 300 words.

"""

# Set up the local llama.cpp model through LangChain's LlamaCpp wrapper.
# The GGUF file location can be overridden with the LLAMA_MODEL_PATH
# environment variable so the script is not tied to one machine; when the
# variable is unset the original path is used.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(
    model_path=os.environ.get(
        'LLAMA_MODEL_PATH',
        "/Users/liuqiang/Desktop/code/llm/models/gguf/qwen1.5-72b-chat-q5_k_m.gguf",
    ),
    temperature=0.8,
    top_p=0.8,
    n_ctx=6000,
    callback_manager=callback_manager,
    verbose=True,
)

# Bind the news-writing template to the model; the chain has the single
# input variable "news".
prompt = PromptTemplate(
    input_variables=["news"],
    template=prompt_template,
)
chain = LLMChain(llm=llm, prompt=prompt)

import ast

# Collect the uids already present in the log so a restarted run does not
# generate them again.
exist_set = set()
# On the very first run the log does not exist yet, which simply means that
# nothing has been generated so far.
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Records may be Python dict reprs (single-quoted) rather than
                # JSON; parse those as literals, never with eval().
                try:
                    data = ast.literal_eval(line)
                except (ValueError, SyntaxError):
                    # Unreadable record: skip it so that user is generated again.
                    continue
            if isinstance(data, dict) and 'uid' in data:
                exist_set.add(data['uid'])

import ast

# Read the per-user browsing summaries, one record per line.
with open(current_path + '/output/personalized_news_summary.log', "r") as file:
    lines = file.readlines()

# Make sure the output directory exists up front; otherwise every append below
# would fail only after the (expensive) model call has already been made.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Write one personalized news article for every user not yet in the log.
for line in tqdm(lines):
    line = line.strip()
    if not line:
        continue
    # Each record is a dict literal. ast.literal_eval accepts literals only,
    # whereas the eval() used previously would execute any code in the file.
    info = ast.literal_eval(line)
    uid = info['uid']
    news = info['news']

    start_time = time.time()
    if uid in exist_set:
        continue

    if not news:
        continue

    try:
        enhanced = chain.run(news=news)
        enhanced = enhanced.rstrip('\n')
        # Write JSON, not str(dict): the resume step parses this log with
        # json.loads, which rejects single-quoted Python reprs.
        with open(save_path, 'a') as f:
            f.write(json.dumps({"uid": uid, "news": enhanced}) + '\n')
    except Exception as e:
        # Best effort: report which user failed and continue with the next one.
        print(f'{uid}: {e}')

    # Throttle so that one iteration takes at least MIN_INTERVAL seconds.
    interval = time.time() - start_time
    if interval <= MIN_INTERVAL:
        time.sleep(MIN_INTERVAL - interval)