|
import pandas as pd |
|
import numpy as np |
|
from sklearn.model_selection import train_test_split |
|
from lightgbm import LGBMRegressor |
|
from sklearn.metrics import mean_squared_log_error |
|
|
|
|
|
# Raw training and test tables, read from the local input directory.
train, test = pd.read_csv("./input/train.csv"), pd.read_csv("./input/test.csv")
|
|
|
|
|
|
|
def preprocess_data(data):
    """Build the model's feature table from a raw frame.

    Derives calendar parts (hour, weekday, month, year, day) from the
    ``datetime`` column, an hour-by-workingday interaction term, and
    sine/cosine encodings of hour, weekday and month.

    Args:
        data: DataFrame with a ``datetime`` column that ``pd.to_datetime``
            can parse and a numeric ``workingday`` column.

    Returns:
        A new DataFrame with the input columns plus the engineered ones,
        with ``datetime`` removed and ``casual`` / ``registered`` removed
        when present. The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    features = data.copy()
    stamps = pd.to_datetime(features["datetime"])

    features["hour"] = stamps.dt.hour
    features["day_of_week"] = stamps.dt.dayofweek
    features["month"] = stamps.dt.month
    features["year"] = stamps.dt.year
    features["day"] = stamps.dt.day
    features["hour_workingday_interaction"] = (
        features["hour"] * features["workingday"]
    )

    # Sine/cosine pairs put the last and first value of each cycle next to
    # each other (hour 23 and hour 0, Sunday and Monday, December and
    # January). Month is 1-based, so it is shifted to start at 0 first.
    cycles = (("hour", 24, 0), ("day_of_week", 7, 0), ("month", 12, 1))
    for column, period, first_value in cycles:
        angle = (features[column] - first_value) * (2.0 * np.pi / period)
        features[f"{column}_sin"] = np.sin(angle)
        features[f"{column}_cos"] = np.cos(angle)

    return features.drop(
        ["datetime", "casual", "registered"], axis=1, errors="ignore"
    )
|
|
|
|
|
# Replace both raw tables with their engineered-feature versions.
train, test = preprocess_data(train), preprocess_data(test)

# Every column except the target is a feature; the target is modelled as
# log1p(count), so predictions have to be mapped back with expm1.
X = train.drop(columns=["count"])
y = np.log1p(train["count"])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
# Gradient-boosted regressor fitted on the log1p-transformed target.
model = LGBMRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)

# Predictions are in log1p space. Floor them at 0 so that expm1 can never
# produce a negative count; a negative value is not a valid count and makes
# mean_squared_log_error raise on scikit-learn versions that reject
# negative inputs.
y_pred = np.maximum(model.predict(X_val), 0)
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred)))
print(f"RMSLE with cyclic features: {rmsle}")
|
|
|
|
|
# Predict in log1p space and floor at 0 so the inverse transform below can
# never emit a negative count.
test_pred = np.maximum(model.predict(test), 0)

# The preprocessed test table no longer has its "datetime" column, so it is
# read back from the CSV; usecols limits parsing to that single column.
submission = pd.DataFrame(
    {
        "datetime": pd.read_csv("./input/test.csv", usecols=["datetime"])[
            "datetime"
        ],
        "count": np.expm1(test_pred),
    }
)
submission.to_csv("./working/submission.csv", index=False)
|
|