import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


# --- Load data ---------------------------------------------------------------
# Both tables are read from the ./input directory relative to the working dir.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Model inputs are every training column except the row identifier ("id")
# and the target ("defects").
X = train_data.drop(["id", "defects"], axis=1)
y = train_data["defects"]

# Only "id" is dropped from the test table; the ids are kept separately so
# they can be written next to the predictions in the submission file.
X_test = test_data.drop("id", axis=1)
test_ids = test_data["id"]

# --- Model configuration -----------------------------------------------------
# Fixed LightGBM hyper-parameters, passed verbatim to LGBMClassifier below.
# NOTE(review): per the LightGBM parameter docs, row subsampling
# (`subsample` / bagging_fraction) only takes effect when `subsample_freq`
# (bagging_freq) is non-zero. It is not set here, so `subsample` may be a
# no-op -- confirm whether that is intended before changing it.
best_params = {
    "num_leaves": 31,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "max_depth": 15,
    "reg_alpha": 0.5,
    "reg_lambda": 0.5,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "n_jobs": -1,
    "random_state": 42,
}

# Single estimator instance reused for feature selection and the final fit.
lgb_model = lgb.LGBMClassifier(**best_params)

# --- Recursive feature elimination with cross-validation ----------------------
# Removes one feature per iteration (step=1) and scores every candidate
# feature count by ROC AUC over 10 unshuffled folds.
# NOTE(review): both RFECV and the LightGBM estimator request n_jobs=-1;
# this nested parallelism may oversubscribe the CPU -- confirm on the target
# machine.
rfecv = RFECV(
    estimator=lgb_model,
    step=1,
    cv=KFold(10),
    scoring="roc_auc",
    n_jobs=-1,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

# Keep only the selected columns in both the training and the test matrices.
X_selected = rfecv.transform(X)
X_test_selected = rfecv.transform(X_test)

# --- Final model and test-set predictions --------------------------------------
# Fit on all training rows using only the selected features.
lgb_model.fit(X_selected, y)

# Column 1 of predict_proba is the probability of the second class in
# `classes_` (the positive label for a 0/1 or boolean target).
final_predictions = lgb_model.predict_proba(X_test_selected)[:, 1]

# --- Write submission ------------------------------------------------------------
# One row per test id with its predicted probability in the "defects" column.
submission = pd.DataFrame({"id": test_ids, "defects": final_predictions})

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("./working", exist_ok=True)
submission.to_csv("./working/submission.csv", index=False)

# --- Cross-validated AUC on the selected features ---------------------------------
# NOTE: the feature subset was chosen using every training row, so this score
# is an optimistic estimate of out-of-sample performance.
auc_scores = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, valid_index in kf.split(X_selected):
    # X_selected is positionally indexed; y is a pandas Series, hence .iloc.
    X_train, X_valid = X_selected[train_index], X_selected[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Use a fresh estimator per fold so the full-data `lgb_model` fitted for
    # the submission is not overwritten by a model trained on 90% of the rows.
    fold_model = lgb.LGBMClassifier(**best_params)
    fold_model.fit(X_train, y_train)

    y_pred = fold_model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred)
    auc_scores.append(auc_score)

mean_auc_score = np.mean(auc_scores)
print(f"Mean AUC Score with Selected Features: {mean_auc_score}")