import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
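# `bayes_opt` is the module name of the `bayesian-optimization` package
# (pip install bayesian-optimization)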
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
# Prepare the data
X = train_data.drop(["id", "smoking"], axis=1)
y = train_data["smoking"]
X_test = test_data.drop("id", axis=1)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # preserve class balance in the holdout
)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# (the test set is transformed just before prediction, after the scaler is refitted on the full data below)
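# Note: gradient-boosted trees are insensitive to monotonic feature scaling,
# so the StandardScaler is not strictly required here; it is kept for pipeline
# consistency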
# Define the LightGBM cross-validation function
def lgb_cv(
    learning_rate,
    num_leaves,
    min_child_samples,
    subsample,
    colsample_bytree,
    max_depth,
    reg_alpha,
    reg_lambda,
    n_estimators,
):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": max(min(learning_rate, 1), 0),
        "n_estimators": int(n_estimators),
        "verbose": -1,
        "num_leaves": int(num_leaves),
        "min_child_samples": int(min_child_samples),
        "subsample": max(min(subsample, 1), 0),
        "colsample_bytree": max(min(colsample_bytree, 1), 0),
        "max_depth": int(max_depth),
        "reg_alpha": max(reg_alpha, 0),
        "reg_lambda": max(reg_lambda, 0),
    }
    cv_result = lgb.cv(
        params,
        lgb.Dataset(X_train_scaled, label=y_train),
        nfold=10,
        seed=42,
        stratified=True,
        metrics=["auc"],
    )
    # LightGBM >= 4.0 prefixes the key with the dataset name ("valid auc-mean");
    # older versions use "auc-mean". Support both.
    key = "valid auc-mean" if "valid auc-mean" in cv_result else "auc-mean"
    # Best mean AUC across boosting rounds
    return max(cv_result[key])
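# Optional sanity check before launching the full search (the values below are
# arbitrary mid-range picks, not tuned): a single call should return a finite
# AUC in (0, 1).
# print(lgb_cv(learning_rate=0.1, num_leaves=40, min_child_samples=20,
#              subsample=0.8, colsample_bytree=0.8, max_depth=10,
#              reg_alpha=0.5, reg_lambda=0.5, n_estimators=300))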
# Define the parameter bounds
param_bounds = {
    "learning_rate": (0.01, 0.2),
    "num_leaves": (20, 60),
    "min_child_samples": (5, 50),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "max_depth": (5, 15),
    "reg_alpha": (0, 1),
    "reg_lambda": (0, 1),
    "n_estimators": (100, 1000),  # increased range for n_estimators
}
# Perform Bayesian optimization with increased initial points and iterations
optimizer = BayesianOptimization(f=lgb_cv, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=10, n_iter=50)
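# 10 random probes + 50 guided iterations = 60 objective evaluations, each a
# full 10-fold CV run; reduce init_points/n_iter for a faster, coarser search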
# Retrieve the best parameters
best_params = optimizer.max["params"]
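# BayesianOptimization suggests every parameter as a float; cast the
# integer-valued hyperparameters back before refitting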
best_params["num_leaves"] = int(best_params["num_leaves"])
best_params["min_child_samples"] = int(best_params["min_child_samples"])
best_params["max_depth"] = int(best_params["max_depth"])
best_params["n_estimators"] = int(best_params["n_estimators"])
# Train and validate the model with the best parameters
final_gbm = lgb.LGBMClassifier(**best_params)
final_gbm.fit(X_train_scaled, y_train)
val_predictions = final_gbm.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_predictions)
print(f"Validation AUC score: {val_auc}")
# Refit the scaler and the model on the full training data, then predict on the test set
final_gbm.fit(scaler.fit_transform(X), y)
# Transform the test set with the refitted scaler so train and test features match
predictions = final_gbm.predict_proba(scaler.transform(X_test))[:, 1]
# Prepare the submission file
submission = pd.DataFrame({"id": test_data["id"], "smoking": predictions})
submission.to_csv("./working/submission.csv", index=False)
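# The submission keeps raw probabilities, which is what a ranking metric such
# as ROC AUC expects; no thresholding to hard labels is needed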