import os
import shutil

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from catboost import Pool
from huggingface_hub import hf_hub_download
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Hugging Face Model Repo
MODEL_REPO = "chagu13/is_click_predictor"
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Model Filenames
CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
XGB_MODEL_FILENAME = "models/xgb_model.pkl"
RF_MODEL_FILENAME = "models/rf_model.pkl"

# Local Paths
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
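# NOTE: the repo filenames above include a "models/" prefix, so hf_hub_download
# mirrors that subfolder under MODEL_DIR; download_model() below then moves each
# file to the flat local path that load_models() expects.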

# Define Features
CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
NUMERICAL_COLUMNS = [
    "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
    "click_sum_age_sex_prod", "click_count_age_sex_prod",
    "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    "click_sum_city_age_prod", "click_count_city_age_prod",
    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
]

FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
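# NOTE: FEATURE_COLUMNS documents the intended model inputs for reference; at
# runtime the authoritative column order is read from catboost.feature_names_.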

def preprocess_input(input_df, expected_feature_order):
    """
    Ensure preprocessing is correct:
    - Removes duplicate columns
    - Computes aggregations using only test data
    - Ensures categorical variables are properly encoded
    - Normalizes numerical features
    - Adds `is_click` column with 0 for compatibility
    - Orders columns as expected by the model
    """
    # Drop the DateTime column if it exists
    if "DateTime" in input_df.columns:
        input_df.drop(columns=["DateTime"], inplace=True)

    # Remove duplicate columns
    input_df = input_df.loc[:, ~input_df.columns.duplicated()]
    input_df.fillna(0, inplace=True)

    # Aggregate by age & gender vs product
    age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique"
    }).reset_index()

    # Rename the aggregated columns
    age_sex_product_agg.columns = ["age_level", "gender", "product",
                                   "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]

    input_df = input_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")

    # Aggregate by city, age, product
    city_age_product_agg = input_df.groupby(["city_development_index", "age_level", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique"
    }).reset_index()

    # Rename the aggregated columns
    city_age_product_agg.columns = ["city_development_index", "age_level", "product",
                                    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]

    input_df = input_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
    input_df.fillna(0, inplace=True)

    # Ensure the click-aggregate columns exist; they cannot be computed without
    # historical click labels, so default them to 0 when absent
    missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
                       "click_sum_city_age_prod", "click_count_city_age_prod"]

    for col in missing_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0  # Fill missing columns with default values

    # Add an `is_click` column of zeros for compatibility with the saved
    # feature order; the models ignore it when predicting
    if "is_click" not in input_df.columns:
        print("Adding `is_click` column with all values set to 0.")
        input_df["is_click"] = 0

    # Feature list (includes `is_click` for compatibility)
    features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
                "product_category_1", "product_category_2", "user_group_id",
                "user_depth", "city_development_index", "var_1",
                "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
                "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
                "click_sum_age_sex_prod", "click_count_age_sex_prod",
                "click_sum_city_age_prod", "click_count_city_age_prod",
                "is_click"]  # Included for compatibility

    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]

    # ===========================
    #  ENCODE CATEGORICAL FEATURES
    # ===========================

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        input_df[col] = le.fit_transform(input_df[col].astype(str))  # fit on the uploaded data only
        label_encoders[col] = le  # keep the encoder for reference

    # Normalize numerical features, creating any that are missing first so the
    # scaler does not raise a KeyError on incomplete uploads
    numerical_columns = [col for col in features if col not in categorical_columns]
    for col in numerical_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0
    scaler = StandardScaler()
    input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
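    # NOTE: fitting LabelEncoder/StandardScaler on the uploaded data only
    # approximates the training-time transforms; persisting the fitted
    # encoders/scaler from training would be the more faithful setup.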

    # ===========================
    #  ENFORCE FEATURE ORDER
    # ===========================
    missing_features = set(expected_feature_order) - set(input_df.columns)
    extra_features = set(input_df.columns) - set(expected_feature_order)

    # Add missing features with default values
    for col in missing_features:
        print(f"Warning: Missing feature {col}. Filling with 0.")
        input_df[col] = 0

    # Drop unexpected features
    if extra_features:
        print(f"Warning: Dropping unexpected features: {extra_features}")
        input_df = input_df.drop(columns=list(extra_features))

    # Reorder columns to match the model's expected input
    input_df = input_df[expected_feature_order]

    return input_df


def download_model(filename, local_path):
    """Download model from Hugging Face and move it to the correct location."""
    temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)

    # Ensure correct file placement
    if temp_path != local_path:
        shutil.move(temp_path, local_path)

    return local_path


def load_models():
    """Download and load models from Hugging Face."""
    try:
        print("πŸ”„ Checking and downloading models...")

        # Ensure models are downloaded and placed correctly
        if not os.path.exists(CATBOOST_MODEL_PATH):
            print("πŸš€ Downloading CatBoost model...")
            download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)

        if not os.path.exists(XGB_MODEL_PATH):
            print("πŸš€ Downloading XGBoost model...")
            download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)

        if not os.path.exists(RF_MODEL_PATH):
            print("πŸš€ Downloading RandomForest model...")
            download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)

        # βœ… Load models
        print("πŸ“¦ Loading models...")
        catboost_model = joblib.load(CATBOOST_MODEL_PATH)
        xgb_model = joblib.load(XGB_MODEL_PATH)
        rf_model = joblib.load(RF_MODEL_PATH)

        print("βœ… Models loaded successfully!")
        return catboost_model, xgb_model, rf_model

    except Exception as e:
        print(f"❌ Error loading models: {e}")
        return None, None, None

# Streamlit UI
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")

catboost, xgb, rf = load_models()
if catboost is None or xgb is None:
    st.error("Model loading failed; see the logs for details.")
    st.stop()

expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)
# Upload File
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
if uploaded_file:
    input_df = pd.read_csv(uploaded_file)
    st.success("File uploaded successfully!")

    # ✅ Compute aggregations & preprocess
    input_df = preprocess_input(input_df, expected_feature_order)

    # ✅ Make predictions
    st.subheader("Predictions in Progress...")

    # Define categorical features (MUST MATCH what was used during training)
    cat_features = ["gender", "product", "campaign_id", "webpage_id"]

    # Convert categorical features to strings (MUST be string, not float)
    for col in cat_features:
        input_df[col] = input_df[col].astype(str)
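    # NOTE: CatBoost rejects float-typed categorical values at prediction time;
    # casting the label-encoded integers to str keeps the Pool construction valid.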

    # Ensure input_df has the column order the model expects
    input_df = input_df[expected_feature_order]

    input_pool = Pool(input_df, cat_features=cat_features)
    catboost_preds = catboost.predict(input_pool)
    catboost_probs = catboost.predict_proba(input_pool)[:, 1]
    # Re-encode categoricals as integers for XGBoost, whose sklearn API needs
    # numeric inputs
    label_encoders = {}  # store encoders for reference

    for col in cat_features:
        le = LabelEncoder()
        input_df[col] = input_df[col].astype(str)  # Ensure it's a string
        le.fit(input_df[col])  # Fit only on input_df (since training is done)
        label_encoders[col] = le  # Save encoder for reference
        input_df[col] = le.transform(input_df[col])

    # List of features used during training for XGBoost
    xgb_training_features = [
        "age_level", "gender", "product", "campaign_id", "webpage_id",
        "product_category_1", "product_category_2", "user_group_id",
        "user_depth", "city_development_index", "var_1",
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
    ]

    xgb_preds = xgb.predict(input_df[xgb_training_features])

    # RandomForest predictions are disabled for now: the saved model's feature
    # set (rf.feature_names_in_) does not line up with the preprocessed input,
    # so rf.predict() / rf.predict_proba() are skipped until the input features
    # are aligned with what the model was trained on.

    xgb_probs = xgb.predict_proba(input_df[xgb_training_features])[:, 1]
    # rf_probs = rf.predict_proba(input_df)[:, 1]  # disabled with RandomForest

    # ✅ Apply thresholds to convert probabilities into binary predictions
    XGB_THRESHOLD = 0.7  # static; adjust to control false positives

    # ✅ Debugging: print probability distributions before thresholding
    print("🔍 Probability Distributions Before Thresholding:")
    print("CatBoost:\n", pd.Series(catboost_probs).describe())
    print("XGBoost:\n", pd.Series(xgb_probs).describe())

    # ✅ Dynamically set the CatBoost threshold from its probability distribution
    CATBOOST_THRESHOLD = np.percentile(catboost_probs, 95)  # 95th percentile
    print(f"✅ Adjusted CatBoost Threshold: {CATBOOST_THRESHOLD:.3f}")

    catboost_preds = (catboost_probs >= CATBOOST_THRESHOLD).astype(int)
    xgb_preds = (xgb_probs >= XGB_THRESHOLD).astype(int)  # keep static for comparison
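    # By construction, the percentile threshold flags roughly the top 5% of rows
    # as clicks regardless of the absolute probability scale.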

    # ✅ Debugging: count of 1s and 0s after thresholding
    print("\nPost-threshold Distribution:")
    print(f"CatBoost 1s: {np.sum(catboost_preds)} / {len(catboost_preds)}")
    print(f"XGBoost 1s: {np.sum(xgb_preds)} / {len(xgb_preds)}")

    # ✅ Build `predictions_df` from the thresholded predictions
    predictions_df = pd.DataFrame({
        "CatBoost": catboost_preds,
        "XGBoost": xgb_preds
    })

    # ✅ Sanity check: warn if a model predicts a click for every row
    if (predictions_df["CatBoost"].sum() == len(predictions_df)
            or predictions_df["XGBoost"].sum() == len(predictions_df)):
        print("⚠ Warning: Model is predicting only 1s! Consider adjusting thresholds.")

    # Apply "at least one model predicts 1" rule
    predictions_df["is_click_predicted"] = predictions_df.max(axis=1)

    # Generate probability file
    probabilities_df = pd.DataFrame({
        "CatBoost_Prob": catboost_probs,
        "XGBoost_Prob": xgb_probs,
      #  "RandomForest_Prob": rf_probs
    })

    # Save results
    binary_predictions_path = "binary_predictions.csv"
    filtered_predictions_path = "filtered_predictions.csv"
    probabilities_path = "model_probabilities.csv"

    predictions_df.to_csv(binary_predictions_path, index=False)
    predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
    probabilities_df.to_csv(probabilities_path, index=False)
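    # NOTE: these CSVs are written to the app's shared working directory, so
    # concurrent sessions can overwrite each other's results; per-session temp
    # files would avoid that in a multi-user deployment.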

    st.success("Predictions completed! Download results below.")

    # Download Buttons
    with open(binary_predictions_path, "rb") as f:
        st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")

    with open(filtered_predictions_path, "rb") as f:
        st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")

    with open(probabilities_path, "rb") as f:
        st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")