1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
import pandas as pd

# Directory holding the sampled Ali-CCP csv splits.
data_path = '../examples/ranking/data/ali-ccp'

# Read the three splits separately so their sizes can be recorded before
# they are merged into one frame.
df_train = pd.read_csv(data_path + '/ali_ccp_train_sample.csv')
df_val = pd.read_csv(data_path + '/ali_ccp_val_sample.csv')
df_test = pd.read_csv(data_path + '/ali_ccp_test_sample.csv')

n_train, n_val, n_test = len(df_train), len(df_val), len(df_test)
print("train : val : test = %d %d %d" % (n_train, n_val, n_test))
print(df_train.head(5))

# Positional boundaries inside the concatenated frame:
# rows [0, train_idx) are train, [train_idx, val_idx) are validation,
# and [val_idx, end) are test.
train_idx = n_train
val_idx = n_train + n_val

# Stack the splits row-wise and give the label columns task-specific names.
data = pd.concat([df_train, df_val, df_test], axis=0).rename(
    columns={'purchase': 'cvr_label', 'click': 'ctr_label'}
)

# ctcvr label is the element-wise product of the cvr and ctr labels.
data["ctcvr_label"] = data['cvr_label'].mul(data['ctr_label'])
from torch_rechub.models.multi_task import ESMM
from torch_rechub.basic.features import DenseFeature, SparseFeature

# Column bookkeeping: label columns, the hand-listed dense columns, and
# everything else in the frame, which is treated as sparse.
label_cols = ['cvr_label', 'ctr_label', "ctcvr_label"]
dense_cols = ['D109_14', 'D110_14', 'D127_14', 'D150_14', 'D508', 'D509', 'D702', 'D853']
col_names = data.columns.values.tolist()

_non_sparse = set(dense_cols) | set(label_cols)
sparse_cols = [name for name in col_names if name not in _non_sparse]
print("sparse cols:%d dense cols:%d" % (len(sparse_cols), len(dense_cols)))

# Only the sparse columns are fed to the model below; dense_cols is used
# here just for the exclusion and the printed count.
used_cols = sparse_cols
item_cols = ['129', '205', '206', '207', '210', '216']
user_cols = [name for name in used_cols if name not in item_cols]


def _sparse_feature(name):
    """Build a 16-dim SparseFeature for column *name*.

    The vocabulary size passed is the column's maximum value plus one.
    """
    return SparseFeature(name, data[name].max() + 1, embed_dim=16)


user_features = [_sparse_feature(name) for name in user_cols]
item_features = [_sparse_feature(name) for name in item_cols]

# ESMM with the same hidden-layer sizes for the cvr and ctr parameter sets.
model = ESMM(
    user_features,
    item_features,
    cvr_params={"dims": [16, 8]},
    ctr_params={"dims": [16, 8]},
)
from torch_rechub.utils.data import DataGenerator


def _take_rows(rows):
    """Return (features, labels) for the positional row slice *rows*.

    Features are a dict mapping each used column name to its value array;
    labels are the label_cols value array for the same rows.
    """
    features = {name: data[name].values[rows] for name in used_cols}
    return features, data[label_cols].values[rows]


# Cut the concatenated frame back into its train / validation / test parts
# using the boundaries recorded before concatenation.
x_train, y_train = _take_rows(slice(None, train_idx))
x_val, y_val = _take_rows(slice(train_idx, val_idx))
x_test, y_test = _take_rows(slice(val_idx, None))

# The generator is seeded with the training split; validation and test
# data are handed over when the dataloaders are created.
dg = DataGenerator(x_train, y_train)
train_dataloader, val_dataloader, test_dataloader = dg.generate_dataloader(
    x_val=x_val,
    y_val=y_val,
    x_test=x_test,
    y_test=y_test,
    batch_size=1024,
)
import torch
import os
from torch_rechub.trainers import MTLTrainer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Training hyper-parameters.
learning_rate = 1e-3
epoch = 1
weight_decay = 1e-5

# Directory passed to the trainer as model_path. exist_ok=True makes the
# call a no-op when the directory already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
save_dir = '../examples/ranking/data/ali-ccp/saved'
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): two task types are declared here while label_cols lists
# three labels (cvr, ctr, ctcvr) -- TODO confirm this matches how
# MTLTrainer treats ESMM.
task_types = ["classification", "classification"]

mtl_trainer = MTLTrainer(
    model,
    task_types=task_types,
    optimizer_params={"lr": learning_rate, "weight_decay": weight_decay},
    n_epoch=epoch,
    earlystop_patience=1,
    device=device,
    model_path=save_dir,
)

# Fit on the training loader with the validation loader supplied, then
# evaluate the trainer's model on the test loader.
mtl_trainer.fit(train_dataloader, val_dataloader)
auc = mtl_trainer.evaluate(mtl_trainer.model, test_dataloader)
print(f'test auc: {auc}')
|