import os
import shutil

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from catboost import Pool
from huggingface_hub import hf_hub_download
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Hugging Face model repo
MODEL_REPO = "chagu13/is_click_predictor"
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Model filenames (paths inside the Hugging Face repo)
CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
XGB_MODEL_FILENAME = "models/xgb_model.pkl"
RF_MODEL_FILENAME = "models/rf_model.pkl"

# Local paths
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")

# Feature definitions
CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
NUMERICAL_COLUMNS = [
    "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
    "click_sum_age_sex_prod", "click_count_age_sex_prod",
    "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    "click_sum_city_age_prod", "click_count_city_age_prod",
    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
]
FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS


def preprocess_input(input_df, expected_feature_order):
    """
    Prepare the uploaded data so it matches what the models expect:
    - remove duplicate columns
    - compute aggregations from the uploaded data alone
    - label-encode categorical variables
    - normalize numerical features
    - add an `is_click` column of zeros for schema compatibility
    - order columns as the model expects
    """
    # Drop the DateTime column if it exists
    if "DateTime" in input_df.columns:
        input_df.drop(columns=["DateTime"], inplace=True)

    # Remove duplicate columns and fill missing values
    input_df = input_df.loc[:, ~input_df.columns.duplicated()]
    input_df.fillna(0, inplace=True)

    # Aggregate by age & gender vs. product
    age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    age_sex_product_agg.columns = [
        "age_level", "gender", "product",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    ]
    input_df = input_df.merge(
        age_sex_product_agg, on=["age_level", "gender", "product"], how="left"
    )

    # Aggregate by city, age, product
    city_age_product_agg = input_df.groupby(
        ["city_development_index", "age_level", "product"]
    ).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    city_age_product_agg.columns = [
        "city_development_index", "age_level", "product",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
    ]
    input_df = input_df.merge(
        city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left"
    )
    input_df.fillna(0, inplace=True)

    # The click aggregates come from training labels, which are unavailable at
    # inference time, so ensure the columns exist and default them to 0.
    missing_columns = [
        "click_sum_age_sex_prod", "click_count_age_sex_prod",
        "click_sum_city_age_prod", "click_count_city_age_prod",
    ]
    for col in missing_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0
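    # Sketch (assumption, not part of the original pipeline): if the
    # training-time click aggregates were exported, e.g. to a hypothetical
    # "train_click_aggs.csv" keyed on (age_level, gender, product), they
    # could be merged in here instead of defaulting to 0:
    #
    #   train_aggs = pd.read_csv("train_click_aggs.csv")
    #   input_df = input_df.merge(
    #       train_aggs, on=["age_level", "gender", "product"], how="left"
    #   ).fillna(0)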
Filling with 0.") input_df[col] = 0 # Fill missing columns with default values # **Add `is_click` column with 0 for compatibility** if "is_click" not in input_df.columns: print("Adding `is_click` column with all values set to 0.") input_df["is_click"] = 0 # Model will ignore this for prediction # Feature List (Now includes `is_click`) features = ["age_level", "gender", "product", "campaign_id", "webpage_id", "product_category_1", "product_category_2", "user_group_id", "user_depth", "city_development_index", "var_1", "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod", "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod", "click_sum_age_sex_prod", "click_count_age_sex_prod", "click_sum_city_age_prod", "click_count_city_age_prod", "is_click"] # Included for compatibility categorical_columns = ["gender", "product", "campaign_id", "webpage_id"] # =========================== # ENCODE CATEGORICAL FEATURES # =========================== label_encoders = {} for col in categorical_columns: le = LabelEncoder() input_df[col] = le.fit_transform(input_df[col].astype(str)) # Apply transformation correctly label_encoders[col] = le # Store encoder for reference # Normalize numerical features numerical_columns = [col for col in features if col not in categorical_columns] scaler = StandardScaler() input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns]) # =========================== # ENFORCE FEATURE ORDER # =========================== missing_features = set(expected_feature_order) - set(input_df.columns) extra_features = set(input_df.columns) - set(expected_feature_order) # Add missing features with default values for col in missing_features: print(f"Warning: Missing feature {col}. Filling with 0.") input_df[col] = 0 # Drop unexpected features if extra_features: print(f"Warning: Dropping unexpected features: {extra_features}") input_df = input_df.drop(columns=list(extra_features)) # Reorder columns to match the model's expected input input_df = input_df[expected_feature_order] return input_df def download_model(filename, local_path): """Download model from Hugging Face and move it to the correct location.""" temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR) # Ensure correct file placement if temp_path != local_path: shutil.move(temp_path, local_path) return local_path def load_models(): """Download and load models from Hugging Face.""" try: print("🔄 Checking and downloading models...") # Ensure models are downloaded and placed correctly if not os.path.exists(CATBOOST_MODEL_PATH): print("🚀 Downloading CatBoost model...") download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH) if not os.path.exists(XGB_MODEL_PATH): print("🚀 Downloading XGBoost model...") download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH) if not os.path.exists(RF_MODEL_PATH): print("🚀 Downloading RandomForest model...") download_model(RF_MODEL_FILENAME, RF_MODEL_PATH) # ✅ Load models print("📦 Loading models...") catboost_model = joblib.load(CATBOOST_MODEL_PATH) xgb_model = joblib.load(XGB_MODEL_PATH) rf_model = joblib.load(RF_MODEL_PATH) print("✅ Models loaded successfully!") return catboost_model, xgb_model, rf_model except Exception as e: print(f"❌ Error loading models: {e}") return None, None, None # Streamlit UI st.title("Is_Click Predictor - ML Model Inference") st.info("Upload a CSV file, and the trained models will predict click probability.") catboost, xgb, rf = load_models() expected_feature_order = catboost.feature_names_ print("Expected 
Feature Order:", expected_feature_order) # Upload File uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"]) if uploaded_file: input_df = pd.read_csv(uploaded_file) st.success("File uploaded successfully!") # ✅ Compute aggregations & preprocess input_df = preprocess_input(input_df, expected_feature_order) # ✅ Make Predictions st.subheader("Predictions in Progress...") from catboost import Pool # Define categorical features (MUST MATCH what was used during training) cat_features = ["gender", "product", "campaign_id", "webpage_id"] # Convert categorical features to strings (MUST be string, not float) for col in cat_features: input_df[col] = input_df[col].astype(str) expected_feature_order = catboost.feature_names_ print("Expected Feature Order:", expected_feature_order) # Ensure input_df has the correct column order input_df = input_df[expected_feature_order] input_pool = Pool(input_df, cat_features=cat_features) catboost_preds = catboost.predict(input_pool) catboost_probs = catboost.predict_proba(input_df)[:, 1] label_encoders = {} # Store encoders to ensure consistency for col in cat_features: le = LabelEncoder() input_df[col] = input_df[col].astype(str) # Ensure it's a string le.fit(input_df[col]) # Fit only on input_df (since training is done) label_encoders[col] = le # Save encoder for reference input_df[col] = le.transform(input_df[col]) # List of features used during training for XGBoost xgb_training_features = [ "age_level", "gender", "product", "campaign_id", "webpage_id", "product_category_1", "product_category_2", "user_group_id", "user_depth", "city_development_index", "var_1", "click_sum_age_sex_prod", "click_count_age_sex_prod", "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod", "click_sum_city_age_prod", "click_count_city_age_prod", "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod" ] xgb_preds = xgb.predict(input_df[xgb_training_features]) # # 🔥 List of features RandomForest was trained with # rf_training_features = [ # "age_level", "gender", "product", "campaign_id", "webpage_id", # "product_category_1", "product_category_2", "user_group_id", # "user_depth", "city_development_index", "var_1", # "click_sum_age_sex_prod", "click_count_age_sex_prod", # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod", # "click_sum_city_age_prod", "click_count_city_age_prod", # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod" # ] # # # ✅ Ensure all training features exist in `input_df` # for col in rf_training_features: # if col not in input_df.columns: # input_df[col] = 0 # Default missing columns to 0 # # # Get intersection of trained features and current input_df columns # common_features = list(set(rf.feature_names_in_) & set(input_df.columns)) # # # Select only the matching features # input_df_rf = input_df[common_features] # # # Predict without needing to add missing features # rf_preds = rf.predict(input_df_rf) # # # print("RF Model Trained Features:", rf.feature_names_in_) # print("Input Data Features:", input_df_rf.columns.tolist()) # # # Debugging: Check for missing or extra features # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns) # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_) # # print("Missing Features in Input:", missing_features) # print("Extra Features in Input:", extra_features) # # ✅ Make Predictions with RandomForest # rf_preds = rf.predict(input_df_rf) xgb_probs = xgb.predict_proba(input_df)[:, 1] #rf_probs = rf.predict_proba(input_df)[:, 1] #test # 
    # Combine results
    # ✅ Thresholds convert probabilities into binary predictions; raise
    # them to cut false positives.

    # ✅ Debugging: probability distributions before thresholding
    print("🔍 Probability Distributions Before Thresholding:")
    print("CatBoost:\n", pd.Series(catboost_probs).describe())
    print("XGBoost:\n", pd.Series(xgb_probs).describe())

    # ✅ Dynamically set the CatBoost threshold to its 95th percentile;
    # XGBoost keeps a static 0.7 threshold for comparison.
    THRESHOLD = np.percentile(catboost_probs, 95)
    print(f"✅ Adjusted CatBoost Threshold: {THRESHOLD:.3f}")

    catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
    xgb_preds = (xgb_probs >= 0.7).astype(int)  # Static for comparison

    # ✅ Debugging: count of 1s and 0s after thresholding
    print("\nPost-threshold Distribution:")
    print(f"CatBoost 1s: {np.sum(catboost_preds)} / {len(catboost_preds)}")
    print(f"XGBoost 1s: {np.sum(xgb_preds)} / {len(xgb_preds)}")

    predictions_df = pd.DataFrame({
        "CatBoost": catboost_preds,
        "XGBoost": xgb_preds,
    })

    # ✅ Sanity check: warn if a model predicts nothing but clicks
    if (predictions_df["CatBoost"].sum() == len(predictions_df)
            or predictions_df["XGBoost"].sum() == len(predictions_df)):
        print("⚠ Warning: Model is predicting only 1s! Consider adjusting thresholds.")

    # A row counts as a click if at least one model predicts 1
    predictions_df["is_click_predicted"] = predictions_df.max(axis=1)

    # Generate probability file
    probabilities_df = pd.DataFrame({
        "CatBoost_Prob": catboost_probs,
        "XGBoost_Prob": xgb_probs,
        # "RandomForest_Prob": rf_probs,
    })

    # Save results
    binary_predictions_path = "binary_predictions.csv"
    filtered_predictions_path = "filtered_predictions.csv"
    probabilities_path = "model_probabilities.csv"

    predictions_df.to_csv(binary_predictions_path, index=False)
    predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
    probabilities_df.to_csv(probabilities_path, index=False)

    st.success("Predictions completed! Download results below.")

    # Download Buttons
    with open(binary_predictions_path, "rb") as f:
        st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
    with open(filtered_predictions_path, "rb") as f:
        st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
    with open(probabilities_path, "rb") as f:
        st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")
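# Run note (assumption: the script is saved as app.py; adjust to the actual
# filename):
#   streamlit run app.py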