import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Load the competition data and the supplementary census features.
train_data = pd.read_csv("./input/train.csv")
census_data = pd.read_csv("./input/census_starter.csv")
test_data = pd.read_csv("./input/test.csv")

# Attach the county-level census features to both splits.
train_data = train_data.merge(census_data, on="cfips", how="left")
test_data = test_data.merge(census_data, on="cfips", how="left")
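# Note: this assumes census_starter.csv carries one row of census features per
# cfips (county FIPS code), so each merge is many-to-one and attaches the same
# county-level columns to every monthly row for that county.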

# Impute missing numeric values with the column median.
numerical_transformer = SimpleImputer(strategy="median")

# Use only numeric columns that are present in both train and test as features;
# this excludes the target and any train-only columns that would break prediction.
numeric_columns = train_data.select_dtypes(exclude=["object", "datetime"]).columns
feature_columns = [
    col
    for col in numeric_columns
    if col != "microbusiness_density" and col in test_data.columns
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, feature_columns),
    ]
)
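# ColumnTransformer drops unlisted columns by default (remainder="drop"), so only
# the numeric feature columns above reach the model. Any categorical columns
# (e.g. county or state, if present) would need an encoder such as OneHotEncoder
# added here before the model could use them.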

# Random forest regressor wrapped in a pipeline so the imputer is fit only on the
# training folds during cross-validation.
model = RandomForestRegressor(random_state=0)

my_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])


def smape(actual, predicted):
    """Symmetric mean absolute percentage error (SMAPE), in percent."""
    denominator = (abs(actual) + abs(predicted)) / 2.0
    diff = abs(predicted - actual) / denominator
    # Define the error as 0 where both actual and predicted are 0.
    diff[denominator == 0] = 0.0
    return 100 * diff.mean()
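# Quick sanity check of the metric: for actual = 100 and predicted = 110,
# SMAPE = 100 * |110 - 100| / ((|100| + |110|) / 2) = 100 * 10 / 105 ≈ 9.52%.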

# make_scorer negates the metric (greater_is_better=False) so that grid search,
# which maximizes the score, effectively minimizes SMAPE.
smape_scorer = make_scorer(smape, greater_is_better=False)

param_grid = {
    "model__n_estimators": [50, 100, 150],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
}

grid_search = GridSearchCV(
    my_pipeline, param_grid=param_grid, cv=3, scoring=smape_scorer, n_jobs=-1
)
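# The grid covers 3 * 4 * 3 = 36 parameter combinations; with cv=3 that is 108
# cross-validation fits (plus a final refit of the best pipeline), run in
# parallel across all available cores via n_jobs=-1.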

grid_search.fit(train_data[feature_columns], train_data["microbusiness_density"])

# Report the best hyperparameters and the corresponding (un-negated) SMAPE.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best SMAPE score: {-grid_search.best_score_}")

# With refit=True (the default), best_estimator_ has already been refit on the
# full training set, so it can be used directly for prediction.
best_pipeline = grid_search.best_estimator_
test_preds = best_pipeline.predict(test_data[feature_columns])

# Write predictions in the expected submission format.
output = pd.DataFrame({"row_id": test_data.row_id, "microbusiness_density": test_preds})
output.to_csv("./working/submission.csv", index=False)