File size: 2,594 Bytes
5cbc1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, KFold
import numpy as np
from sklearn.metrics import make_scorer

# Load the competition data splits from the input directory.
DATA_DIR = "./input"
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")


# Preprocess the data
def preprocess_data(data):
    """Engineer calendar features from ``date`` and one-hot encode categoricals.

    Adds year/month/day/day-of-week fields, a weekend flag, ISO week number,
    day-of-year plus its sine/cosine encoding (to capture yearly seasonality
    without a discontinuity at year end), and a country-by-month interaction,
    then one-hot encodes ``country``, ``store``, ``product`` and the
    interaction column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date`` (parseable by ``pd.to_datetime``), ``country``,
        ``store`` and ``product`` columns. Mutated in place for the feature
        columns; the dummy-encoded frame is returned.

    Returns
    -------
    pandas.DataFrame
        The frame with engineered features and dummy columns.

    NOTE(review): calling this separately on train and test can yield
    mismatched dummy columns if either split is missing a category value
    (e.g. a country_month combination) — confirm the splits share identical
    category sets, or align columns after encoding.
    """
    data["date"] = pd.to_datetime(data["date"])
    data["year"] = data["date"].dt.year
    data["month"] = data["date"].dt.month
    data["day"] = data["date"].dt.day
    data["dayofweek"] = data["date"].dt.dayofweek
    # Vectorized weekend flag (dayofweek: Mon=0 ... Sun=6); replaces a
    # row-wise apply/lambda with an equivalent boolean cast.
    data["is_weekend"] = (data["dayofweek"] >= 5).astype(int)
    data["week_of_year"] = data["date"].dt.isocalendar().week.astype(int)
    data["day_of_year"] = data["date"].dt.dayofyear
    # 365.25 keeps the cycle aligned across leap years.
    data["sin_day_of_year"] = np.sin(2 * np.pi * data["day_of_year"] / 365.25)
    data["cos_day_of_year"] = np.cos(2 * np.pi * data["day_of_year"] / 365.25)
    data["country_month"] = data["country"] + "_" + data["month"].astype(str)
    data = pd.get_dummies(
        data, columns=["country", "store", "product", "country_month"]
    )
    return data


# Apply identical feature engineering to both splits.
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Feature matrix / target for model fitting; identifiers and the raw
# timestamp carry no predictive signal and are excluded.
non_feature_cols = ["num_sold", "date", "row_id"]
X = train_data.drop(columns=non_feature_cols)
y = train_data["num_sold"]


# Define SMAPE function
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, as a fraction (not x100).

    Parameters
    ----------
    y_true, y_pred : array-like of float
        Ground-truth and predicted values; coerced to float ndarrays.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``, with
        pairs where both values are zero contributing 0 error.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Divide only where the denominator is non-zero: avoids the runtime
    # divide-by-zero warning the naive division emits, instead of patching
    # inf/nan entries after the fact.
    diff = np.zeros_like(denominator)
    np.divide(
        np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0
    )
    return np.mean(diff)


# Wrap SMAPE as an sklearn scorer. greater_is_better=False makes sklearn
# negate it, since scorers are maximized during model selection.
smape_scorer = make_scorer(smape, greater_is_better=False)

# Hyper-parameter search space for the LightGBM regressor.
param_grid = dict(
    num_leaves=[31, 50, 70],
    learning_rate=[0.1, 0.05, 0.01],
    n_estimators=[100, 200, 500],
)

# Grid-search LightGBM hyper-parameters with shuffled 10-fold CV,
# scored by (negated) SMAPE.
gbm = lgb.LGBMRegressor(objective="regression", metric="mae", verbose=-1)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=gbm, param_grid=param_grid, cv=kf, scoring=smape_scorer, verbose=1
)
grid_search.fit(X, y)

# GridSearchCV (refit=True by default) already refits the best parameter
# combination on the full training data, so reuse that fitted estimator.
# BUG FIX: re-instantiating LGBMRegressor(**best_params) silently dropped
# the objective/metric/verbose settings used during the search.
best_params = grid_search.best_params_
gbm_best = grid_search.best_estimator_

# Build the test design matrix with the same feature columns as training,
# predict, and write the submission file.
X_test = test_data.drop(columns=["date", "row_id"])
test_data["num_sold"] = gbm_best.predict(X_test)
test_data[["row_id", "num_sold"]].to_csv("./working/submission.csv", index=False)

# best_score_ is the negated SMAPE (scorer maximizes), so flip the sign back.
print("Best SMAPE (GridSearchCV):", -grid_search.best_score_)