|
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
|
|
|
|
|
# Read the training table first, then the test table, from the input folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)
|
|
|
|
|
# One-hot encode the two categorical columns. The encoder is fitted on the
# training data only; with handle_unknown="ignore", categories that were not
# seen during fitting are encoded as all-zero rows instead of raising.
categorical_columns = ["Product ID", "Type"]

try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the former `sparse` keyword was deprecated in 1.2 and removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn releases only accept the original keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# NOTE(review): the dense output has one column per distinct category value;
# if "Product ID" has many distinct values this matrix gets wide — confirm
# that memory use is acceptable for the real data.
encoded_features = encoder.fit_transform(train_data[categorical_columns])
encoded_test_features = encoder.transform(test_data[categorical_columns])

encoded_columns = encoder.get_feature_names_out()

# Build each encoded frame on the index of its source frame: join() aligns on
# the index, so this keeps every row paired with its own encoding even if the
# source frame does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_features, columns=encoded_columns, index=train_data.index
)
train_data = train_data.join(encoded_df).drop(categorical_columns, axis=1)

encoded_test_df = pd.DataFrame(
    encoded_test_features, columns=encoded_columns, index=test_data.index
)
test_data = test_data.join(encoded_test_df).drop(categorical_columns, axis=1)
|
|
|
|
|
# Separate the label from the predictors; "id" is dropped together with the
# label so that it is not used as a feature.
y = train_data["Machine failure"]
X = train_data.drop(columns=["Machine failure", "id"])

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
# Base estimator; the seed keeps the forest reproducible between runs.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter candidates explored exhaustively by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated search scored by ROC AUC, using every available core.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search, as a fitted estimator.
best_rf = grid_search.best_estimator_
|
|
|
|
|
# Score the held-out rows: column 1 of predict_proba is taken as the score
# that is compared against the validation labels.
val_class_probabilities = best_rf.predict_proba(X_val)
y_pred_proba = val_class_probabilities[:, 1]

auc_roc = roc_auc_score(y_val, y_pred_proba)
print(f"AUC-ROC score: {auc_roc}")
|
|
|
|
|
# Predict on the test rows; "id" is removed because it was not a training
# feature. Column 1 of predict_proba is used as the submitted score.
test_predictions = best_rf.predict_proba(test_data.drop("id", axis=1))[:, 1]

submission = pd.DataFrame({"id": test_data["id"], "Machine failure": test_predictions})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing; otherwise the whole run ends in an OSError.
submission_path = "./working/submission.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
|
|