import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
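# `bayes_opt` is the module name of the `bayesian-optimization` package
# (pip install bayesian-optimization)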
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
# Prepare the data
X = train_data.drop(["id", "smoking"], axis=1)
y = train_data["smoking"]
X_test = test_data.drop("id", axis=1)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # preserve class balance in the holdout
)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# (the test set is transformed just before prediction, after the scaler is refitted on the full data below)
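# Note: gradient-boosted trees are insensitive to monotonic feature scaling,
# so the StandardScaler is not strictly required here; it is kept for pipeline
# consistency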
# Define the LightGBM cross-validation function
def lgb_cv(
    learning_rate,
    num_leaves,
    min_child_samples,
    subsample,
    colsample_bytree,
    max_depth,
    reg_alpha,
    reg_lambda,
    n_estimators,
):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": max(min(learning_rate, 1), 0),
        "n_estimators": int(n_estimators),
        "verbose": -1,
        "num_leaves": int(num_leaves),
        "min_child_samples": int(min_child_samples),
        "subsample": max(min(subsample, 1), 0),
        "colsample_bytree": max(min(colsample_bytree, 1), 0),
        "max_depth": int(max_depth),
        "reg_alpha": max(reg_alpha, 0),
        "reg_lambda": max(reg_lambda, 0),
    }
    cv_result = lgb.cv(
        params,
        lgb.Dataset(X_train_scaled, label=y_train),
        nfold=10,
        seed=42,
        stratified=True,
        metrics=["auc"],
    )
    # LightGBM >= 4.0 prefixes the key with the dataset name ("valid auc-mean");
    # older versions use "auc-mean". Support both.
    key = "valid auc-mean" if "valid auc-mean" in cv_result else "auc-mean"
    # Best mean AUC across boosting rounds
    return max(cv_result[key])
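# Optional sanity check before launching the full search (the values below are
# arbitrary mid-range picks, not tuned): a single call should return a finite
# AUC in (0, 1).
# print(lgb_cv(learning_rate=0.1, num_leaves=40, min_child_samples=20,
#              subsample=0.8, colsample_bytree=0.8, max_depth=10,
#              reg_alpha=0.5, reg_lambda=0.5, n_estimators=300))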
# Define the parameter bounds
param_bounds = {
    "learning_rate": (0.01, 0.2),
    "num_leaves": (20, 60),
    "min_child_samples": (5, 50),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "max_depth": (5, 15),
    "reg_alpha": (0, 1),
    "reg_lambda": (0, 1),
    "n_estimators": (100, 1000),  # increased range for n_estimators
}
# Perform Bayesian optimization with increased initial points and iterations
optimizer = BayesianOptimization(f=lgb_cv, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=10, n_iter=50)
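# 10 random probes + 50 guided iterations = 60 objective evaluations, each a
# full 10-fold CV run; reduce init_points/n_iter for a faster, coarser search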
# Retrieve the best parameters
best_params = optimizer.max["params"]
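# BayesianOptimization suggests every parameter as a float; cast the
# integer-valued hyperparameters back before refitting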
best_params["num_leaves"] = int(best_params["num_leaves"])
best_params["min_child_samples"] = int(best_params["min_child_samples"])
best_params["max_depth"] = int(best_params["max_depth"])
best_params["n_estimators"] = int(best_params["n_estimators"])
# Train and validate the model with the best parameters
final_gbm = lgb.LGBMClassifier(**best_params)
final_gbm.fit(X_train_scaled, y_train)
val_predictions = final_gbm.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_predictions)
print(f"Validation AUC score: {val_auc}")
# Refit the scaler and the model on the full training data, then predict on the test set
final_gbm.fit(scaler.fit_transform(X), y)
# Transform the test set with the refitted scaler so train and test features match
predictions = final_gbm.predict_proba(scaler.transform(X_test))[:, 1]
# Prepare the submission file
submission = pd.DataFrame({"id": test_data["id"], "smoking": predictions})
submission.to_csv("./working/submission.csv", index=False)
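# The submission keeps raw probabilities, which is what a ranking metric such
# as ROC AUC expects; no thresholding to hard labels is needed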