# aideml/sample_results/playground-series-s3e20.py
# dominikschmidt's picture
# add open-source AIDE
# 39c930a
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Load the data: the train and test tables are read from the local input
# directory.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Preprocess the data.
# The composite key "ID_LAT_LON_YEAR_WEEK" is split on "_" into five parts.
# Both frames go through exactly the same steps, so they are handled in one
# loop instead of duplicated statements (the frames are modified in place).
id_part_columns = ["ID", "latitude", "longitude", "year", "week_no"]
numeric_part_columns = ["latitude", "longitude", "year", "week_no"]
for frame in (train_data, test_data):
    frame[id_part_columns] = frame["ID_LAT_LON_YEAR_WEEK"].str.split(
        "_", expand=True
    )
    # str.split produces strings; convert every part except "ID" to numbers.
    for col in numeric_part_columns:
        frame[col] = pd.to_numeric(frame[col])

# One-hot encoding for 'week_no'. Each frame is encoded separately, so the two
# frames can end up with different sets of dummy columns.
train_data = pd.get_dummies(train_data, columns=["week_no"])
test_data = pd.get_dummies(test_data, columns=["week_no"])

# Align test_data columns with train_data: the test frame takes the train
# frame's column order, columns it lacks are created and filled with 0, and
# columns that exist only in the test frame are dropped.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
# Prepare the data for training: separate the regression target from the
# predictors, leaving the raw composite key and its "ID" part out of the
# feature matrix.
target_column = "emission"
non_feature_columns = [target_column, "ID_LAT_LON_YEAR_WEEK", "ID"]
y = train_data[target_column]
X = train_data.drop(columns=non_feature_columns)

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Train the LightGBM model.
# Wrap both splits in LightGBM datasets; the validation dataset is created
# with the training dataset as its reference.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
params = {"objective": "regression", "metric": "rmse", "verbose": -1}
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    # Early stopping is requested through a callback: the
    # `early_stopping_rounds` keyword was removed from `lgb.train` in
    # LightGBM 4.0, while the callback form is also accepted by older
    # releases. Training stops once the validation metric has not improved
    # for 10 consecutive rounds.
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)
# Predict on validation set using the best iteration recorded during training.
y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

# Evaluate the model.
# RMSE is computed as the square root of the plain MSE: the `squared=False`
# argument of `mean_squared_error` was deprecated in scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
print(f"Validation RMSE: {rmse}")
# Predict on test set and save submission.
# The aligned test frame carries the key columns and a placeholder "emission"
# column; none of them were used as features, so they are removed first.
non_predictor_columns = ["ID_LAT_LON_YEAR_WEEK", "ID", "emission"]
test_features = test_data.drop(columns=non_predictor_columns)

# Score the test rows with the best iteration and store the result back on
# the test frame under the target column name.
test_predictions = gbm.predict(test_features, num_iteration=gbm.best_iteration)
test_data["emission"] = test_predictions

# The submission file holds only the row key and the predicted value.
submission = test_data.loc[:, ["ID_LAT_LON_YEAR_WEEK", "emission"]]
submission.to_csv("./working/submission.csv", index=False)