Source: Kaggle official course | Intermediate Machine Learning
2024-08-31@isSeymour

Intermediate Machine Learning

1. Missing Values

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Data preparation
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# Keep numeric columns only, to focus on missing values
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)
# Missing-value summary
print("Missing values:")
print(X_train.shape)
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])


# Scoring function
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Approach 1: drop columns with missing values
print('-'*25 + "\nApproach 1: drop columns with missing values")
cols_with_missing = [col for col in X.columns
                     if X[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

# Approach 2: imputation (SimpleImputer fills with the column mean by default)
print('-'*25 + "\nApproach 2: imputation")
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed the column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

# Approach 3: an extension to imputation (also record which values were missing)
print('-'*25 + "\nApproach 3: an extension to imputation")
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns
print("MAE (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))
Missing values:
(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64
-------------------------
Approach 1: drop columns with missing values
MAE (Drop columns with missing values):
17837.82570776256
-------------------------
Approach 2: imputation
MAE (Imputation):
18062.894611872147
-------------------------
Approach 3: an extension to imputation
MAE (An Extension to Imputation):
18148.417180365297
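
The code above loads X_test but never imputes it. To score the test set for a submission, the imputer should be fitted on the training data only and then applied unchanged to the test data. A minimal sketch under that assumption (the final_* names are mine, not from the course):

# Sketch: fit on training data only, apply the same statistics to the test set
final_imputer = SimpleImputer()
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_test = pd.DataFrame(final_imputer.transform(X_test))
# Restore the column names stripped by the transform
final_X_train.columns = X_train.columns
final_X_test.columns = X_test.columns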

2. Categorical Variables

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Data preparation
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')

X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# Drop columns with missing values, to focus on the categorical columns
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
# Preview: the validation split contains categories the training split never saw
print("Unique values in 'Condition2' column in training data:\n", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:\n", X_valid['Condition2'].unique())

# Scoring function
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Approach 1: drop categorical variables
print('-'*25 + "\nApproach 1: drop categorical variables")
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))


# Approach 2: ordinal encoding
print('-'*25 + "\nApproach 2: ordinal encoding")
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that can be safely ordinal-encoded: every validation category
# also appears in the training data
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]
bad_label_cols = list(set(object_cols) - set(good_label_cols))

label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

print("MAE (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))


# Approach 3: one-hot encoding
print('-'*25 + "\nApproach 3: one-hot encoding")
# Only encode low-cardinality columns to keep the feature count manageable
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed the index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove the original categorical columns and append the encoded ones
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Newer scikit-learn requires string column names throughout
OH_X_train.columns = OH_X_train.columns.astype('str')
OH_X_valid.columns = OH_X_valid.columns.astype('str')

print("MAE (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
Unique values in 'Condition2' column in training data:
 ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']

Unique values in 'Condition2' column in validation data:
 ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']
-------------------------
Approach 1: drop categorical variables
MAE (Drop categorical variables):
17837.82570776256
-------------------------
Approach 2: ordinal encoding
MAE (Ordinal Encoding):
17098.01649543379
-------------------------
Approach 3: one-hot encoding
MAE (One-Hot Encoding):
17525.345719178084
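
Dropping bad_label_cols is the simplest fix for categories that appear only in the validation data. An alternative, not used in the course code, is to tell OrdinalEncoder to map unseen categories to a sentinel value (available in scikit-learn >= 0.24); a sketch:

# Hypothetical alternative: keep all object columns and encode categories
# unseen during fit as -1 instead of dropping the columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
                                 unknown_value=-1)
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])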

3. Pipelines

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Data preparation
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Low-cardinality categorical columns
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and
                    X_train_full[cname].dtype == "object"]

# Numerical columns
numerical_cols = [cname for cname in X_train_full.columns if
                  X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()


# Preprocessing
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and model in a single Pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
clf.fit(X_train, y_train)

# The pipeline preprocesses automatically, so no separate
# preprocessing of the validation or test data is needed
preds = clf.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))

# Export
preds_test = clf.predict(X_test)
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
print("Saved to: submission.csv")
MAE: 17614.81993150685
Saved to: submission.csv
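
Because the preprocessing lives inside the pipeline, the whole pipeline can also be handed to cross_val_score, which refits the imputers and encoder on each training fold. A sketch (the cross_val_score import and the cv/scoring choices are mine, not from the course code):

from sklearn.model_selection import cross_val_score

# Each fold refits the preprocessing on its own training portion,
# which also guards against train-test contamination
scores = -1 * cross_val_score(clf, X_full[my_cols], y,
                              cv=5, scoring='neg_mean_absolute_error')
print("Average MAE:", scores.mean())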

4. Cross-Validation

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline

# Data preparation
train_data = pd.read_csv('../input/train.csv', index_col='Id')
test_data = pd.read_csv('../input/test.csv', index_col='Id')

train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_data.SalePrice
train_data.drop(['SalePrice'], axis=1, inplace=True)

numeric_cols = [cname for cname in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]
X = train_data[numeric_cols].copy()
X_test = test_data[numeric_cols].copy()

my_pipeline = Pipeline(steps=[
    ('preprocessor', SimpleImputer()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=0))
])

# Scoring function
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of a random forest model.

    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    pipeline_tmp = Pipeline(steps=[
        ("preprocessor", SimpleImputer()),
        ("model", RandomForestRegressor(n_estimators=n_estimators, random_state=0))
    ])
    # Cross-validation: sklearn returns negated MAE, so flip the sign
    scores_tmp = -1 * cross_val_score(pipeline_tmp, X, y,
                                      cv=3,
                                      scoring="neg_mean_absolute_error")
    print(f"{n_estimators} estimators MAE: {scores_tmp}")
    return scores_tmp.mean()

# Try different values of n_estimators
results = {}
for i in range(1, 9):
    results[50*i] = get_score(50*i)

# Plot the results
plt.plot(list(results.keys()), list(results.values()))
plt.show()
50 estimators MAE: [18414.4198768  17950.8061191  18696.29205761]
100 estimators MAE: [18431.44531143 17816.34640657 18937.85378601]
150 estimators MAE: [18243.49104266 17661.60951403 18961.08950617]
200 estimators MAE: [18245.33884668 17671.76715606 18827.93166667]
250 estimators MAE: [18282.77936756 17578.16626694 18904.86203292]
300 estimators MAE: [18251.90511294 17600.66503765 18973.15561728]
350 estimators MAE: [18245.98694241 17586.13852156 18978.75003527]
400 estimators MAE: [18245.06112423 17553.22719713 19012.30560185]

[Figure: average MAE vs. n_estimators over the 3 CV folds]
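
Judging from the averaged scores, the MAE bottoms out around 200 trees. A one-liner (not in the original notebook) to pull the best setting out of the results dict:

# Pick the n_estimators value with the lowest average MAE
n_estimators_best = min(results, key=results.get)
print(n_estimators_best)  # 200 for the scores above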

5. XGBoost (Gradient Boosting)

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Data preparation
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
                        X_train_full[cname].dtype == "object"]

numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encoding (pandas get_dummies is convenient here)
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
# Align so all splits share the training columns; dummies missing from a
# split become NaN, which XGBoost handles natively
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Model 1: XGBoost with default parameters
my_model_1 = XGBRegressor(random_state=0)
my_model_1.fit(X_train, y_train)
predictions_1 = my_model_1.predict(X_valid)
mae_1 = mean_absolute_error(y_valid, predictions_1)
print("Model 1 MAE:", mae_1)


# Model 2: XGBoost with tuned parameters
my_model_2 = XGBRegressor(n_estimators=500, learning_rate=0.05)
my_model_2.fit(X_train, y_train)
predictions_2 = my_model_2.predict(X_valid)
mae_2 = mean_absolute_error(y_valid, predictions_2)
print("Model 2 MAE:", mae_2)

Model 1 MAE: 18161.82412510702
Model 2 MAE: 17101.580024614726
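
The course also covers early stopping, which is missing above: keep n_estimators high and stop adding trees once the validation score stops improving. A sketch, with the caveat that the argument moved between versions (newer xgboost accepts early_stopping_rounds in the constructor; older releases take it as a fit() keyword):

# Model 3 (sketch): early stopping against the validation set
my_model_3 = XGBRegressor(n_estimators=1000, learning_rate=0.05,
                          early_stopping_rounds=5)  # fit() kwarg in older xgboost
my_model_3.fit(X_train, y_train,
               eval_set=[(X_valid, y_valid)],
               verbose=False)
print("Model 3 MAE:", mean_absolute_error(y_valid, my_model_3.predict(X_valid)))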

6. Data Leakage

There are two kinds:

  1. Target leakage: the predictors include information that will not be available at prediction time.
  2. Train-test contamination: preprocessing is fitted using validation or test data, inflating validation scores (see the sketch below).
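
A minimal illustration of train-test contamination (my toy example, not from the course): if an imputer is fitted on the full dataset before splitting, the validation rows influence the imputation statistics; fitting the imputer on the training rows only, or inside a pipeline, avoids this.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Toy data with missing values
df = pd.DataFrame({'feature': [1.0, 2.0, np.nan, 4.0, np.nan, 6.0]})

# Contaminated: the mean used for imputation is computed from ALL rows,
# including those that will end up in the validation set
leaky = SimpleImputer().fit_transform(df)
X_train_bad, X_valid_bad = train_test_split(leaky, random_state=0)

# Correct: split first, fit the imputer on the training rows only,
# then apply the fitted imputer to the validation rows
train_df, valid_df = train_test_split(df, random_state=0)
imputer = SimpleImputer().fit(train_df)
X_train_ok = imputer.transform(train_df)
X_valid_ok = imputer.transform(valid_df)

The course example below instead demonstrates target leakage, on the credit-card dataset.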
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Data preparation
data = pd.read_csv('../input/AER_credit_card_data.csv',
                   true_values=['yes'], false_values=['no'])
y = data.card
X = data.drop(['card'], axis=1)
print("Number of rows in the dataset:", X.shape[0])
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, X, y,
                            cv=5,
                            scoring='accuracy')
# Suspiciously high accuracy
print("Cross-val accuracy: %f" % cv_scores.mean())

# Investigate
print('-'*25)
expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]
print('Fraction of those who did not receive a card and had no expenditures: %.2f' \
      % ((expenditures_noncardholders == 0).mean()))
print('Fraction of those who received a card and had no expenditures: %.2f' \
      % ((expenditures_cardholders == 0).mean()))

# 'expenditure' gives away whether the applicant has a card: card spending
# can only exist after a card is issued, so this is target leakage.
# Fix: drop 'expenditure' and the other columns that leak the target.
print('-'*25)
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(potential_leaks, axis=1)
cv_scores = cross_val_score(my_pipeline, X2, y,
                            cv=5,
                            scoring='accuracy')
print("Cross-val accuracy: %f" % cv_scores.mean())
Number of rows in the dataset: 1319
Cross-val accuracy: 0.978779
-------------------------
Fraction of those who did not receive a card and had no expenditures: 1.00
Fraction of those who received a card and had no expenditures: 0.02
-------------------------
Cross-val accuracy: 0.834719