import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
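
# Load the competition tables; the ./input and ./working paths suggest a
# Kaggle-style directory layout.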
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
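
# Encode the categorical "EJ" column as integers: fit on train and reuse the
# fitted mapping on test so both splits share the same codes.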
le = LabelEncoder()
train_data["EJ"] = le.fit_transform(train_data["EJ"])
test_data["EJ"] = le.transform(test_data["EJ"])
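
# Split features from the target; "Id" is a row identifier, not a predictor.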
X = train_data.drop(["Id", "Class"], axis=1)
y = train_data["Class"]
X_test = test_data.drop("Id", axis=1)
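
# Gradient-boosted decision trees for binary classification; is_unbalance=True
# makes LightGBM reweight the classes to compensate for the skewed target
# (note that reweighting shifts predicted probabilities, which can affect a
# raw log-loss score).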
model = lgb.LGBMClassifier(objective="binary", boosting_type="gbdt", is_unbalance=True)
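
# Search space: learning rate, tree complexity (num_leaves, max_depth,
# min_child_samples), histogram resolution (max_bin), and row/column
# subsampling.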
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [15, 31, 63],
    "max_depth": [-1, 5, 10],
    "min_child_samples": [10, 20, 30],
    "max_bin": [255, 300],
    "subsample": [0.6, 0.8, 1.0],
    # In LightGBM, "subsample" is inert unless bagging is enabled by setting
    # subsample_freq above 0.
    "subsample_freq": [1],
    "colsample_bytree": [0.3, 0.5, 0.7],
}
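
# Randomized search over the grid, scored by log loss (the competition metric).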
# The built-in "neg_log_loss" scorer stands in for
# make_scorer(log_loss, greater_is_better=False, needs_proba=True): sklearn
# scorers follow a greater-is-better convention, and the needs_proba argument
# has been removed from recent scikit-learn releases.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    n_iter=10,
    scoring="neg_log_loss",
    # StratifiedKFold keeps the class ratio consistent across folds; a plain
    # KFold does not guarantee that on an imbalanced target.
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    random_state=42,
    verbose=1,
)

random_search.fit(X, y)
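
# best_score_ holds the negated log loss, so flip the sign for reporting.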
best_model = random_search.best_estimator_
best_score = -random_search.best_score_
print(f"Best Log Loss: {best_score}")
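
# Predicted probability of class 1 for each test row.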
test_predictions = best_model.predict_proba(X_test)[:, 1]
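
# Build the submission: one probability column per class, summing to 1 per row.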
submission = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "class_0": 1 - test_predictions,
        "class_1": test_predictions,
    }
)
submission.to_csv("./working/submission.csv", index=False)