File size: 2,568 Bytes
5cbc1e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import LabelEncoder


def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 100 * np.mean(diff)


# Load data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Convert date to datetime and extract features
for df in [train, test]:
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["day_of_week"] = df["date"].dt.dayofweek  # Extract day of the week
    # Create interaction terms
    df["country_store"] = df["country"] + "_" + df["store"]
    df["country_product"] = df["country"] + "_" + df["product"]
    df["store_product"] = df["store"] + "_" + df["product"]

# Encode new categorical features
label_encoders = {}
for col in ["country_store", "country_product", "store_product"]:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Define categorical features including new interaction terms
cat_features = [
    "country",
    "store",
    "product",
    "country_store",
    "country_product",
    "store_product",
]

# Prepare data for training
X = train.drop(["num_sold", "date", "id"], axis=1)
y = train["num_sold"]

# Cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
smape_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = CatBoostRegressor(
        iterations=2000,  # Increase iterations
        learning_rate=0.05,  # Decrease learning rate
        depth=8,  # Increase depth
        loss_function="MAE",
        cat_features=cat_features,
        verbose=200,
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True)

    preds = model.predict(X_valid)
    score = smape(y_valid, preds)
    smape_scores.append(score)

print(f"Average SMAPE: {np.mean(smape_scores)}")

# Prepare test data and make predictions
test_data = test.drop(["date", "id"], axis=1)
predictions = model.predict(test_data)

# Save submission
submission = pd.DataFrame({"id": test["id"], "num_sold": predictions})
submission.to_csv("./working/submission.csv", index=False)