# Standard library
import os

# Import helpful libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Set up filepaths: expose the course CSVs under the short ../input/ paths
# that the rest of the script reads from.
_DATA_LINKS = (
    ("../input/home-data-for-ml-course/train.csv", "../input/train.csv"),
    ("../input/home-data-for-ml-course/test.csv", "../input/test.csv"),
)

# Each link is checked on its own. Guarding both symlinks behind a single
# check of train.csv would skip test.csv whenever train.csv already exists,
# and would raise FileExistsError when only test.csv is already present.
for _source_path, _link_path in _DATA_LINKS:
    if not os.path.exists(_link_path):
        os.symlink(_source_path, _link_path)
# Read the training CSV and pull the target column (SalePrice) out as y.
iowa_file_path = '../input/train.csv'
home_data = pd.read_csv(iowa_file_path)
y = home_data['SalePrice']

# Predictor columns that make up X (edit this list to try other features).
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Keep only the predictor columns and print the first rows as a preview.
X = home_data[features]
print(X.head())
# Split into validation and training data. random_state is pinned so the
# same split is produced on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Define a random forest model, fit it on the training split only, and
# predict prices for the held-out validation rows.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)

# mean_absolute_error is declared as (y_true, y_pred), so the true prices go
# first. The resulting value is the same either way because absolute error
# is symmetric; the order is fixed to match the API.
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))
# To improve accuracy, build a second random forest that is trained on all
# of the training data (X, y) rather than only the train_X / train_y split.
rf_model_on_full_data = RandomForestRegressor(random_state=1)

# Fit on the full data. This is left as a bare trailing expression so a
# notebook still displays the fitted estimator as the cell output.
rf_model_on_full_data.fit(X, y)
RandomForestRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=1)
3. 模型预测
1 2 3 4 5 6 7 8 9 10 11 12
# Path to the file used for predictions.
test_data_path = '../input/test.csv'

# Read the test data file with pandas.
test_data = pd.read_csv(test_data_path)

# Build test_X from test_data, keeping only the columns used for prediction;
# that column list is stored in the variable `features`.
test_X = test_data[features]

# Predictions that will be submitted, made with the model fitted on all of
# the training data.
test_preds = rf_model_on_full_data.predict(test_X)
4. 导出结果
1 2 3 4
# Save the predictions in the format used for competition scoring: one row
# per test Id with its predicted SalePrice, written without the index column.
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)