import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tqdm import tqdm

from torch_rechub.basic.features import DenseFeature, SparseFeature
from torch_rechub.models.ranking import DCN, DeepFM, WideDeep
from torch_rechub.trainers import CTRTrainer
from torch_rechub.utils.data import DataGenerator

# Fix torch's random seed.
torch.manual_seed(2026)
# Relative path of the Criteo sample CSV.
data_path = '../examples/ranking/data/criteo/criteo_sample.csv'

# Load the sample into a DataFrame. If the raw data is a .gz file, pass
# compression="gzip" to pd.read_csv as well.
data = pd.read_csv(filepath_or_buffer=data_path)
# Split the columns by name prefix: names starting with "I" are treated as
# dense features, names starting with "C" as sparse features.
dense_cols = [
    name
    for name in data.columns.tolist()
    if name[0] == "I"
]
sparse_cols = [
    name
    for name in data.columns.tolist()
    if name[0] == "C"
]
def convert_numeric_feature(val):
    """Discretize a numeric value into an integer bucket.

    This is a discretization scheme shared by the Criteo competition winner;
    other discretization schemes can be tried instead.

    Args:
        val: A value that ``int()`` can convert.

    Returns:
        ``v - 2`` when the integer value ``v`` is at most 2, otherwise
        ``int(np.log(v) ** 2)``.
    """
    number = int(val)
    if number <= 2:
        return number - 2
    return int(np.log(number) ** 2)


# Add a discretized copy of every dense column and register it as a new
# sparse column named "<col>_sparse".
for col in tqdm(dense_cols):
    sparse_cols.append(col + "_sparse")
    data[col + "_sparse"] = data[col].apply(convert_numeric_feature)
# Label-encode every sparse column in place, fitting a fresh encoder per column.
for col in tqdm(sparse_cols):
    lbe = LabelEncoder()
    data[col] = lbe.fit_transform(data[col])
# Key step: wrap every column in one of the feature classes torch-rechub
# supports. A dense feature only needs its name; a sparse feature also needs
# the number of distinct values (vocab_size) and the embedding dimension
# (embed_dim).
dense_features = [DenseFeature(name) for name in dense_cols]
sparse_features = [
    SparseFeature(
        name,
        vocab_size=data[name].nunique(),
        embed_dim=16,
    )
    for name in sparse_cols
]

# Detach the "label" column as y; x is the same DataFrame object as data,
# now without that column.
y = data.pop("label")
x = data