|
import pandas as pd |
|
import numpy as np |
|
from lightgbm import LGBMClassifier |
|
from sklearn.ensemble import ( |
|
RandomForestClassifier, |
|
VotingClassifier, |
|
GradientBoostingClassifier, |
|
) |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV |
|
from sklearn.metrics import roc_auc_score |
|
|
|
|
|
train_data = pd.read_csv("./input/train.csv") |
|
test_data = pd.read_csv("./input/test.csv") |
|
|
|
|
|
common_features = list(set(train_data.columns) & set(test_data.columns)) |
|
common_features.remove("id") |
|
|
|
|
|
X_train = train_data[common_features] |
|
y_train_EC1 = train_data["EC1"] |
|
y_train_EC2 = train_data["EC2"] |
|
X_test = test_data[common_features] |
|
|
|
|
|
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) |
|
|
|
|
|
model1_EC1 = LGBMClassifier(random_state=42) |
|
model1_EC2 = LGBMClassifier(random_state=42) |
|
model2 = RandomForestClassifier(n_estimators=100, random_state=42) |
|
model3 = LogisticRegression(max_iter=1000) |
|
model4 = GradientBoostingClassifier(random_state=42) |
|
|
|
|
|
param_grid = { |
|
"n_estimators": [100, 200], |
|
"learning_rate": [0.05, 0.1], |
|
"max_depth": [3, 5], |
|
} |
|
|
|
|
|
grid_search = GridSearchCV(model4, param_grid, cv=skf, scoring="roc_auc") |
|
grid_search.fit( |
|
X_train, y_train_EC1 |
|
) |
|
best_params = grid_search.best_params_ |
|
|
|
|
|
model4 = GradientBoostingClassifier(random_state=42, **best_params) |
|
|
|
|
|
voting_clf_EC1 = VotingClassifier( |
|
estimators=[("lgbm", model1_EC1), ("rf", model2), ("lr", model3), ("gbc", model4)], |
|
voting="soft", |
|
) |
|
voting_clf_EC2 = VotingClassifier( |
|
estimators=[("lgbm", model1_EC2), ("rf", model2), ("lr", model3), ("gbc", model4)], |
|
voting="soft", |
|
) |
|
|
|
|
|
cv_scores_EC1 = cross_val_score( |
|
voting_clf_EC1, X_train, y_train_EC1, cv=skf, scoring="roc_auc" |
|
) |
|
auc_EC1 = np.mean(cv_scores_EC1) |
|
|
|
|
|
cv_scores_EC2 = cross_val_score( |
|
voting_clf_EC2, X_train, y_train_EC2, cv=skf, scoring="roc_auc" |
|
) |
|
auc_EC2 = np.mean(cv_scores_EC2) |
|
|
|
|
|
print(f"Validation AUC for EC1: {auc_EC1}") |
|
print(f"Validation AUC for EC2: {auc_EC2}") |
|
print(f"Average Validation AUC: {(auc_EC1 + auc_EC2) / 2}") |
|
|
|
|
|
voting_clf_EC1.fit(X_train, y_train_EC1) |
|
voting_clf_EC2.fit(X_train, y_train_EC2) |
|
|
|
|
|
test_data["EC1"] = voting_clf_EC1.predict_proba(X_test)[:, 1] |
|
test_data["EC2"] = voting_clf_EC2.predict_proba(X_test)[:, 1] |
|
|
|
|
|
submission = test_data[["id", "EC1", "EC2"]] |
|
submission.to_csv("./working/submission.csv", index=False) |
|
|