import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Data preparation
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, then separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# Drop columns with missing values (to keep things simple)
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Split off a validation set from the training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Preview: categories in the training split vs. the validation split
print("Unique values in 'Condition2' column in training data:\n", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:\n", X_valid['Condition2'].unique())
# Approach 2: ordinal encoding
print('-'*25 + "\nApproach 2: ordinal encoding")
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that can be safely ordinal encoded: every category in the validation split also appears in training
good_label_cols = [col for col in object_cols if set(X_valid[col]).issubset(set(X_train[col]))]
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols) - set(good_label_cols))
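# A minimal sketch (assumed, not from the original text) of how these column lists are then
# used: drop the problematic columns and apply OrdinalEncoder, fitting on the training split
# and reusing the fitted encoder on the validation split.
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
# label_X_train / label_X_valid can now be fed to a RandomForestRegressor and scored with mean_absolute_error.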
# Approach 3: one-hot encoding
print('-'*25 + "\nApproach 3: one-hot encoding")
# Columns that will be one-hot encoded (low cardinality keeps the number of new columns manageable)
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# High-cardinality columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))
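# A minimal sketch (assumed) of the one-hot step: encode the low-cardinality columns,
# re-attach the non-object columns, and drop everything else. `sparse_output=False` is the
# scikit-learn >= 1.2 spelling; older versions use `sparse=False`.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]), index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]), index=X_valid.index)

# Numeric part of the data (all object columns removed)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Concatenate numeric columns with the one-hot columns; string column names keep sklearn happy
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)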
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
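# A minimal sketch (assumed, not the original lesson code) of how these imports fit together:
# a ColumnTransformer imputes numeric columns and one-hot encodes low-cardinality categoricals,
# and a Pipeline bundles the preprocessing with the model. It reuses the X_train / X_valid split
# from above; the column lists are recomputed here so the snippet is self-contained.
numerical_cols = [c for c in X_train.columns if X_train[c].dtype in ['int64', 'float64']]
categorical_cols = [c for c in X_train.columns if X_train[c].dtype == "object" and X_train[c].nunique() < 10]

preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='constant'), numerical_cols),
    ('cat', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]), categorical_cols),
])

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0)),
])

# Columns not listed in the ColumnTransformer are dropped by default
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds))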
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline
# Read the data (same train/test files as above)
train_data = pd.read_csv('../input/train.csv', index_col='Id')
test_data = pd.read_csv('../input/test.csv', index_col='Id')

# Remove rows with missing target, then separate target from predictors
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_data.SalePrice
train_data.drop(['SalePrice'], axis=1, inplace=True)
# Keep numeric predictors only
numeric_cols = [cname for cname in train_data.columns
                if train_data[cname].dtype in ['int64', 'float64']]
X = train_data[numeric_cols].copy()
X_test = test_data[numeric_cols].copy()
# Evaluation function
def get_score(n_estimators):
    """Return the average MAE over 3 CV folds of a random forest model.

    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    pipeline_tmp = Pipeline(steps=[
        ("preprocessor", SimpleImputer()),
        ("model", RandomForestRegressor(n_estimators=n_estimators, random_state=0))
    ])
    # Cross-validation scores (sign flipped because sklearn returns negative MAE)
    scores_tmp = -1 * cross_val_score(pipeline_tmp, X, y, cv=3, scoring="neg_mean_absolute_error")
    print(f"{n_estimators} estimators MAE: {scores_tmp}")
    return scores_tmp.mean()
# Try different values of n_estimators
results = {}
for i in range(1, 9):
    results[50*i] = get_score(50*i)
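# A short follow-up sketch (not from the original text): use the matplotlib import above to
# visualize the MAE for each candidate n_estimators and pick the best value from `results`.
plt.plot(list(results.keys()), list(results.values()))
plt.xlabel("n_estimators")
plt.ylabel("Mean absolute error (3-fold CV)")
plt.show()

# Value of n_estimators with the lowest cross-validated MAE
n_estimators_best = min(results, key=results.get)
print("Best n_estimators:", n_estimators_best)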
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
# Data preparation
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
# Categorical columns with relatively low cardinality
low_cardinality_cols = [cname for cname in X_train_full.columns
                        if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
# Numeric columns
numeric_cols = [cname for cname in X_train_full.columns
                if X_train_full[cname].dtype in ['int64', 'float64']]
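# A minimal sketch (assumed, not the original exercise code) of how these column selections are
# typically used with XGBRegressor: keep the selected columns, one-hot encode the categoricals
# with pd.get_dummies, align the frames so they share columns, then fit and score.
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode and align column sets across the three frames
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# The hyperparameters here are illustrative, not tuned values from the original
my_model = XGBRegressor(n_estimators=500, learning_rate=0.05, random_state=0)
my_model.fit(X_train, y_train)
predictions = my_model.predict(X_valid)
print("MAE:", mean_absolute_error(y_valid, predictions))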
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Data preparation
data = pd.read_csv('../input/AER_credit_card_data.csv',
                   true_values=['yes'], false_values=['no'])
y = data.card
X = data.drop(['card'], axis=1)
print("Number of rows in the dataset:", X.shape[0])

# Cross-validation on all features: accuracy is suspiciously high
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')
print("Cross-val accuracy: %f" % cv_scores.mean())
# Inspect the suspect feature
print('-'*25)
expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]
print('Fraction of those who did not receive a card and had no expenditures: %.2f'
      % ((expenditures_noncardholders == 0).mean()))
print('Fraction of those who received a card and had no expenditures: %.2f'
      % ((expenditures_cardholders == 0).mean()))
Number of rows in the dataset: 1319
Cross-val accuracy: 0.978779
-------------------------
Fraction of those who did not receive a card and had no expenditures: 1.00
Fraction of those who received a card and had no expenditures: 0.02
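# The second accuracy figure below comes from re-running cross-validation after removing the
# features that leak information about the target. This is a sketch of that step, assuming the
# leaky columns usually flagged for this dataset (expenditure, share, active, majorcards).
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(potential_leaks, axis=1)
cv_scores = cross_val_score(my_pipeline, X2, y, cv=5, scoring='accuracy')
print('-'*25)
print("Cross-val accuracy: %f" % cv_scores.mean())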
-------------------------
Cross-val accuracy: 0.834719