|
import pandas as pd |
|
import lightgbm as lgb |
|
from sklearn.model_selection import train_test_split, KFold |
|
from sklearn.metrics import roc_auc_score |
|
from sklearn.preprocessing import StandardScaler |
|
from bayes_opt import BayesianOptimization |
|
|
|
|
|
# Load the competition data; "smoking" is the binary target column.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Separate features from the target and drop the identifier column.
X = train_data.drop(columns=["id", "smoking"])
y = train_data["smoking"]
X_test = test_data.drop(columns=["id"])

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features with statistics computed on the training split only,
# so no validation/test information leaks into the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
|
|
|
|
|
|
|
def lgb_cv(
    learning_rate,
    num_leaves,
    min_child_samples,
    subsample,
    colsample_bytree,
    max_depth,
    reg_alpha,
    reg_lambda,
    n_estimators,
):
    """Objective for Bayesian optimization: 10-fold stratified CV AUC.

    The optimizer proposes continuous values, so integer-valued
    hyperparameters are truncated and fraction-valued ones are clipped
    into their legal ranges before training.

    Returns:
        float: best mean validation AUC over boosting rounds.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        # Clip fractions into [0, 1]; the optimizer may step slightly
        # outside the declared bounds.
        "learning_rate": max(min(learning_rate, 1), 0),
        "n_estimators": int(n_estimators),
        "verbose": -1,
        "num_leaves": int(num_leaves),
        "min_child_samples": int(min_child_samples),
        "subsample": max(min(subsample, 1), 0),
        "colsample_bytree": max(min(colsample_bytree, 1), 0),
        "max_depth": int(max_depth),
        "reg_alpha": max(reg_alpha, 0),
        "reg_lambda": max(reg_lambda, 0),
    }
    # FIX: the `verbose_eval=` and `metrics=` keyword arguments were
    # removed from lgb.cv in LightGBM 4.x (this call crashed on current
    # versions), and `metrics=["auc"]` merely duplicated params["metric"].
    cv_result = lgb.cv(
        params,
        lgb.Dataset(X_train_scaled, label=y_train),
        nfold=10,
        seed=42,
        stratified=True,
    )
    # LightGBM < 4 reports "auc-mean"; >= 4 prefixes the dataset name.
    key = "auc-mean" if "auc-mean" in cv_result else "valid auc-mean"
    return max(cv_result[key])
|
|
|
|
|
|
|
# Search space handed to BayesianOptimization. Integer-valued
# hyperparameters are sampled as floats and truncated inside lgb_cv.
param_bounds = dict(
    learning_rate=(0.01, 0.2),
    num_leaves=(20, 60),
    min_child_samples=(5, 50),
    subsample=(0.6, 1.0),
    colsample_bytree=(0.6, 1.0),
    max_depth=(5, 15),
    reg_alpha=(0, 1),
    reg_lambda=(0, 1),
    n_estimators=(100, 1000),
)
|
|
|
|
|
# Run the Bayesian search: 10 random probes followed by 50 guided steps.
optimizer = BayesianOptimization(f=lgb_cv, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=10, n_iter=50)

# The optimizer returns every parameter as a float; restore integer
# types where LightGBM requires them before building the final model.
best_params = optimizer.max["params"]
for int_param in ("num_leaves", "min_child_samples", "max_depth", "n_estimators"):
    best_params[int_param] = int(best_params[int_param])
|
|
|
|
|
# Train the tuned model on the training split and score it on the
# held-out validation set.
final_gbm = lgb.LGBMClassifier(**best_params)
final_gbm.fit(X_train_scaled, y_train)
val_proba = final_gbm.predict_proba(X_val_scaled)
val_auc = roc_auc_score(y_val, val_proba[:, 1])
print(f"Validation AUC score: {val_auc}")
|
|
|
|
|
# BUG FIX: the original refit `scaler` on the full training data but
# predicted on X_test_scaled, which had been transformed with the
# train-split-only scaler — so the final model trained and predicted
# under different scaling statistics. Fit one scaler on all of X and
# use that same scaler for the test features.
full_scaler = StandardScaler()
final_gbm.fit(full_scaler.fit_transform(X), y)
predictions = final_gbm.predict_proba(full_scaler.transform(X_test))[:, 1]
|
|
|
|
|
# Write the submission file in the required (id, smoking) format,
# without the pandas index column.
submission = test_data[["id"]].copy()
submission["smoking"] = predictions
submission.to_csv("./working/submission.csv", index=False)
|
|