Spaces:

samkeet
/

MediaMixOptimization

Sleeping

File size: 26,884 Bytes

00b00eb

import streamlit as st
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle
import os
from utilities_with_panel import load_local_css, set_header
import yaml
from yaml import SafeLoader
import sqlite3
from datetime import timedelta
from utilities import (
    set_header,
    load_local_css,
    update_db,
    project_selection,
    retrieve_pkl_object,
)
from utilities_with_panel import (
    overview_test_data_prep_panel,
    overview_test_data_prep_nonpanel,
    initialize_data_cmp,
    create_channel_summary,
    create_contribution_pie,
    create_contribuion_stacked_plot,
    create_channel_spends_sales_plot,
    format_numbers,
    channel_name_formating,
)
from log_application import log_message
import sys, traceback
from post_gres_cred import db_cred

st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()


schema = db_cred["schema"]

if "username" not in st.session_state:
    st.session_state["username"] = None

if "project_name" not in st.session_state:
    st.session_state["project_name"] = None

if "project_dct" not in st.session_state:
    project_selection()
    st.stop()

tuned_model = retrieve_pkl_object(
    st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
)

if tuned_model is None:
    st.error("Please save a tuned model")
    st.stop()

if (
    "session_state_saved" in st.session_state["project_dct"]["model_tuning"].keys()
    and st.session_state["project_dct"]["model_tuning"]["session_state_saved"] != []
):
    for key in ["used_response_metrics", "media_data", "bin_dict"]:
        if key not in st.session_state:
            st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
                "session_state_saved"
            ][key]


## DEFINE ALL FUNCTIONS
def get_random_effects(media_data, panel_col, mdf):
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end="\r")
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, "random_effect"] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df


def process_train_and_test(train, test, features, panel_col, target_col):
    X1 = train[features]

    ss = MinMaxScaler()
    X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)

    X1[panel_col] = train[panel_col]
    X1[target_col] = train[target_col]

    if test is not None:
        X2 = test[features]
        X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
        X2[panel_col] = test[panel_col]
        X2[target_col] = test[target_col]
        return X1, X2
    return X1


def mdf_predict(X_df, mdf, random_eff_df):
    X = X_df.copy()
    X = pd.merge(
        X,
        random_eff_df[[panel_col, "random_effect"]],
        on=panel_col,
        how="left",
    )
    X["pred_fixed_effect"] = mdf.predict(X)

    X["pred"] = X["pred_fixed_effect"] + X["random_effect"]
    X.drop(columns=["pred_fixed_effect", "random_effect"], inplace=True)

    return X


try:
    if "username" in st.session_state and st.session_state["username"] is not None:

        # conn = sqlite3.connect(
        #     r"DB/User.db", check_same_thread=False
        # )  # connection with sql db
        # c = conn.cursor()

        tuned_model = retrieve_pkl_object(
            st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
        )

        if tuned_model is None:
            st.error("Please save a tuned model")
            st.stop()

        if (
            "session_state_saved"
            in st.session_state["project_dct"]["model_tuning"].keys()
            and st.session_state["project_dct"]["model_tuning"]["session_state_saved"]
            != []
        ):
            for key in [
                "used_response_metrics",
                "is_tuned_model",
                "media_data",
                "X_test_spends",
                "spends_data",
            ]:
                st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
                    "session_state_saved"
                ][key]
        elif (
            "session_state_saved"
            in st.session_state["project_dct"]["model_build"].keys()
            and st.session_state["project_dct"]["model_build"]["session_state_saved"]
            != []
        ):
            for key in [
                "used_response_metrics",
                "date",
                "saved_model_names",
                "media_data",
                "X_test_spends",
            ]:
                st.session_state[key] = st.session_state["project_dct"]["model_build"][
                    "session_state_saved"
                ][key]
        else:
            st.error("Please tune a model first")
        st.session_state["bin_dict"] = st.session_state["project_dct"]["model_build"][
            "session_state_saved"
        ]["bin_dict"]
        st.session_state["media_data"].columns = [
            c.lower() for c in st.session_state["media_data"].columns
        ]

        # with open(
        #     os.path.join(st.session_state["project_path"], "data_import.pkl"),
        #     "rb",
        # ) as f:
        #     data = pickle.load(f)

        # # Accessing the loaded objects

        # st.session_state["orig_media_data"] = data["final_df"]

        st.session_state["orig_media_data"] = st.session_state["project_dct"][
            "data_import"
        ][
            "imputed_tool_df"
        ].copy()  # db
        st.session_state["channels"] = st.session_state["project_dct"]["data_import"][
            "group_dict"
        ].copy()
        # target='Revenue'

        # set the panel column
        panel_col = "panel"
        is_panel = (
            True if st.session_state["media_data"][panel_col].nunique() > 1 else False
        )

        date_col = "date"

        cols1 = st.columns([2, 1])
        with cols1[0]:
            st.markdown(f"**Welcome {st.session_state['username']}**")
        with cols1[1]:
            st.markdown(f"**Current Project: {st.session_state['project_name']}**")

        st.title("AI Model Media Performance")

        def remove_response_metric(name):
            # Convert the name to a lowercase string and remove any leading or trailing spaces
            name_str = str(name).lower().strip()

            # Check if the name starts with "response metric" or "response_metric"
            if name_str.startswith("response metric"):
                return name[len("response metric") :].replace("_", " ").strip().title()
            elif name_str.startswith("response_metric"):
                return name[len("response_metric") :].replace("_", " ").strip().title()
            else:
                return name

        sel_target_col = st.selectbox(
            "Select the response metric",
            st.session_state["used_response_metrics"],
            format_func=remove_response_metric,
        )
        sel_target_col_frmttd = sel_target_col.replace("_", " ").replace("-", " ")
        sel_target_col_frmttd = sel_target_col_frmttd.title()
        target_col = (
            sel_target_col.lower()
            .replace(" ", "_")
            .replace("-", "")
            .replace(":", "")
            .replace("__", "_")
        )
        target = sel_target_col

        # Contribution
        if is_panel:
            # read tuned mixedLM model
            if st.session_state["is_tuned_model"][target_col] == True:

                model_dict = retrieve_pkl_object(
                    st.session_state["project_number"],
                    "Model_Tuning",
                    "tuned_model",
                    schema,
                )  # db

                saved_models = list(model_dict.keys())
                required_saved_models = [
                    m.split("__")[0]
                    for m in saved_models
                    if m.split("__")[1] == target_col
                ]

                sel_model = required_saved_models[
                    0
                ]  # only 1 tuned model available per resp metric

                sel_model_dict = model_dict[sel_model + "__" + target_col]

                model = sel_model_dict["Model_object"]
                X_train = sel_model_dict["X_train_tuned"]
                X_test = sel_model_dict["X_test_tuned"]
                best_feature_set = sel_model_dict["feature_set"]

            # Calculate contributions

            st.session_state["orig_media_data"].columns = [
                col.lower()
                .replace(".", "_")
                .replace("@", "_")
                .replace(" ", "_")
                .replace("-", "")
                .replace(":", "")
                .replace("__", "_")
                for col in st.session_state["orig_media_data"].columns
            ]

            media_data = st.session_state["media_data"]

            contri_df = pd.DataFrame()

            y = []
            y_pred = []

            random_eff_df = get_random_effects(media_data, panel_col, model)
            random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
            random_eff_df["panel_effect"] = (
                random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
            )

            coef_df = pd.DataFrame(model.fe_params)
            coef_df.reset_index(inplace=True)
            coef_df.columns = ["feature", "coef"]

            x_train_contribution = X_train.copy()
            x_test_contribution = X_test.copy()

            # preprocessing not needed since X_train is already preprocessed
            # X1, X2 = process_train_and_test(x_train_contribution, x_test_contribution, best_feature_set, panel_col, target_col)
            # x_train_contribution[best_feature_set] = X1[best_feature_set]
            # x_test_contribution[best_feature_set] = X2[best_feature_set]

            x_train_contribution = mdf_predict(
                x_train_contribution, model, random_eff_df
            )
            x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)

            x_train_contribution = pd.merge(
                x_train_contribution,
                random_eff_df[[panel_col, "panel_effect"]],
                on=panel_col,
                how="left",
            )
            x_test_contribution = pd.merge(
                x_test_contribution,
                random_eff_df[[panel_col, "panel_effect"]],
                on=panel_col,
                how="left",
            )

            for i in range(len(coef_df))[1:]:
                coef = coef_df.loc[i, "coef"]
                col = coef_df.loc[i, "feature"]
                if col.lower() != "intercept":
                    x_train_contribution[str(col) + "_contr"] = (
                        coef * x_train_contribution[col]
                    )
                    x_test_contribution[str(col) + "_contr"] = (
                        coef * x_train_contribution[col]
                    )

            tuning_cols = [
                c
                for c in x_train_contribution.filter(regex="contr").columns
                if c
                in [
                    "day_of_week_contr",
                    "Trend_contr",
                    "sine_wave_contr",
                    "cosine_wave_contr",
                ]
            ]
            flag_cols = [
                c
                for c in x_train_contribution.filter(regex="contr").columns
                if "_flag" in c
            ]

            # add exogenous contribution to base
            all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
            all_exog_vars = [
                var.lower()
                .replace(".", "_")
                .replace("@", "_")
                .replace(" ", "_")
                .replace("-", "")
                .replace(":", "")
                .replace("__", "_")
                for var in all_exog_vars
            ]
            exog_cols = []
            if len(all_exog_vars) > 0:
                for col in x_train_contribution.filter(regex="contr").columns:
                    if (
                        len([exog_var for exog_var in all_exog_vars if exog_var in col])
                        > 0
                    ):
                        exog_cols.append(col)

            base_cols = ["panel_effect"] + flag_cols + tuning_cols + exog_cols

            x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
                axis=1
            )
            x_train_contribution.drop(columns=base_cols, inplace=True)
            x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
                axis=1
            )
            x_test_contribution.drop(columns=base_cols, inplace=True)

            overall_contributions = pd.concat(
                [x_train_contribution, x_test_contribution]
            ).reset_index(drop=True)

            overview_test_data_prep_panel(
                overall_contributions,
                st.session_state["orig_media_data"],
                st.session_state["spends_data"],
                date_col,
                panel_col,
                target_col,
            )

        else:  # NON PANEL
            if st.session_state["is_tuned_model"][target_col] == True:  # Sprint4
                # with open(
                #     os.path.join(st.session_state["project_path"], "tuned_model.pkl"),
                #     "rb",
                # ) as file:
                #     model_dict = pickle.load(file)

                model_dict = retrieve_pkl_object(
                    st.session_state["project_number"],
                    "Model_Tuning",
                    "tuned_model",
                    schema,
                )  # db

                saved_models = list(model_dict.keys())
                required_saved_models = [
                    m.split("__")[0]
                    for m in saved_models
                    if m.split("__")[1] == target_col
                ]

                sel_model = required_saved_models[
                    0
                ]  # only 1 tuned model available per resp metric
                sel_model_dict = model_dict[sel_model + "__" + target_col]

                model = sel_model_dict["Model_object"]
                X_train = sel_model_dict["X_train_tuned"]
                X_test = sel_model_dict["X_test_tuned"]
                best_feature_set = sel_model_dict["feature_set"]

            x_train_contribution = X_train.copy()
            x_test_contribution = X_test.copy()

            x_train_contribution["pred"] = model.predict(
                x_train_contribution[best_feature_set]
            )
            x_test_contribution["pred"] = model.predict(
                x_test_contribution[best_feature_set]
            )

            coef_df = pd.DataFrame(model.params)
            coef_df.reset_index(inplace=True)
            coef_df.columns = ["feature", "coef"]

            # st.write(coef_df)
            for i in range(len(coef_df)):
                coef = coef_df.loc[i, "coef"]
                col = coef_df.loc[i, "feature"]
                if col != "const":
                    x_train_contribution[str(col) + "_contr"] = (
                        coef * x_train_contribution[col]
                    )
                    x_test_contribution[str(col) + "_contr"] = (
                        coef * x_test_contribution[col]
                    )
                else:
                    x_train_contribution["const"] = coef
                    x_test_contribution["const"] = coef

            tuning_cols = [
                c
                for c in x_train_contribution.filter(regex="contr").columns
                if c
                in [
                    "day_of_week_contr",
                    "Trend_contr",
                    "sine_wave_contr",
                    "cosine_wave_contr",
                ]
            ]
            flag_cols = [
                c
                for c in x_train_contribution.filter(regex="contr").columns
                if "_flag" in c
            ]

            # add exogenous contribution to base
            all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
            all_exog_vars = [
                var.lower()
                .replace(".", "_")
                .replace("@", "_")
                .replace(" ", "_")
                .replace("-", "")
                .replace(":", "")
                .replace("__", "_")
                for var in all_exog_vars
            ]
            exog_cols = []
            if len(all_exog_vars) > 0:
                for col in x_train_contribution.filter(regex="contr").columns:
                    if (
                        len([exog_var for exog_var in all_exog_vars if exog_var in col])
                        > 0
                    ):
                        exog_cols.append(col)

            base_cols = ["const"] + flag_cols + tuning_cols + exog_cols
            # st.write(base_cols)
            x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
                axis=1
            )
            x_train_contribution.drop(columns=base_cols, inplace=True)

            x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
                axis=1
            )
            x_test_contribution.drop(columns=base_cols, inplace=True)
            # x_test_contribution.to_csv("Test/test_contr.csv", index=False)

            overall_contributions = pd.concat(
                [x_train_contribution, x_test_contribution]
            ).reset_index(drop=True)
            # overall_contributions.to_csv("Test/overall_contributions.csv", index=False)

            overview_test_data_prep_nonpanel(
                overall_contributions,
                st.session_state["orig_media_data"].copy(),
                st.session_state["spends_data"].copy(),
                date_col,
                target_col,
            )
        # for k, v in st.session_sta
        # te.items():

        #     if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
        #         st.session_state[k] = v

        # authenticator = st.session_state.get('authenticator')

        # if authenticator is None:
        #     authenticator = load_authenticator()

        # name, authentication_status, username = authenticator.login('Login', 'main')
        # auth_status = st.session_state['authentication_status']

        # if auth_status:
        #     authenticator.logout('Logout', 'main')

        #     is_state_initiaized = st.session_state.get('initialized',False)
        #     if not is_state_initiaized:

        min_date = X_train[date_col].min().date()
        max_date = X_test[date_col].max().date()
        if "media_performance" not in st.session_state["project_dct"]:
            st.session_state["project_dct"]["media_performance"] = {
                "start_date": None,
                "end_date": None,
            }

        start_default = st.session_state["project_dct"]["media_performance"].get(
            "start_date", None
        )
        start_default = start_default if start_default is not None else min_date
        start_default = start_default if start_default > min_date else min_date
        start_default = start_default if start_default < max_date else min_date

        end_default = st.session_state["project_dct"]["media_performance"].get(
            "end_date", None
        )
        end_default = end_default if end_default is not None else max_date
        end_default = end_default if end_default > min_date else max_date
        end_default = end_default if end_default < max_date else max_date

        st.write("Select a timeline for analysis")
        date_columns = st.columns(2)

        with date_columns[0]:
            start_date = st.date_input(
                "Select Start Date",
                start_default,
                min_value=min_date,
                max_value=max_date,
            )
            if (start_date < min_date) or (start_date > max_date):
                st.error("Please select dates in the range of the dates in the data")
                st.stop()
        end_default = (
            end_default if end_default > start_date + timedelta(days=1) else max_date
        )
        with date_columns[1]:
            end_default = (
                end_default
                if pd.Timestamp(end_default) >= pd.Timestamp(start_date)
                else start_date
            )

            end_date = st.date_input(
                "Select End Date",
                end_default,
                min_value=start_date + timedelta(days=1),
                max_value=max_date,
            )
        if (
            (start_date < min_date)
            or (end_date < min_date)
            or (start_date > max_date)
            or (end_date > max_date)
        ):
            st.error("Please select dates in the range of the dates in the data")
            st.stop()
        if end_date < start_date + timedelta(days=1):
            st.error("Please select end date after start date")
            st.stop()

        st.session_state["project_dct"]["media_performance"]["start_date"] = start_date
        st.session_state["project_dct"]["media_performance"]["end_date"] = end_date

        st.header("Overview of Previous Media Spend")

        initialize_data_cmp(target_col, is_panel, panel_col, start_date, end_date)
        scenario = st.session_state["scenario"]
        raw_df = st.session_state["raw_df"]

        columns = st.columns(2)

        with columns[0]:
            st.metric(
                label="Media Spend",
                value=format_numbers(float(scenario.actual_total_spends)),
            )
        ###print(f"##################### {scenario.actual_total_sales} ##################")
        with columns[1]:
            st.metric(
                label=sel_target_col_frmttd,
                value=format_numbers(
                    float(scenario.actual_total_sales), include_indicator=False
                ),
            )

        actual_summary_df = create_channel_summary(scenario, sel_target_col_frmttd)
        actual_summary_df["Channel"] = actual_summary_df["Channel"].apply(
            channel_name_formating
        )

        columns = st.columns((3, 1))
        with columns[0]:
            with st.expander("Channel wise overview"):
                st.markdown(
                    actual_summary_df.style.set_table_styles(
                        [
                            {
                                "selector": "th",
                                "props": [("background-color", "#f6dcc7")],
                            },
                            {
                                "selector": "tr:nth-child(even)",
                                "props": [("background-color", "#f6dcc7")],
                            },
                        ]
                    ).to_html(),
                    unsafe_allow_html=True,
                )

        st.markdown("<hr>", unsafe_allow_html=True)
        ##############################

        st.plotly_chart(
            create_contribution_pie(scenario, sel_target_col_frmttd),
            use_container_width=True,
        )
        st.markdown("<hr>", unsafe_allow_html=True)

        ################################3
        st.plotly_chart(
            create_contribuion_stacked_plot(scenario, sel_target_col_frmttd),
            use_container_width=True,
        )
        st.markdown("<hr>", unsafe_allow_html=True)
        #######################################

        selected_channel_name = st.selectbox(
            "Channel",
            st.session_state["channels_list"] + ["non media"],
            format_func=channel_name_formating,
        )
        selected_channel = scenario.channels.get(selected_channel_name, None)

        st.plotly_chart(
            create_channel_spends_sales_plot(selected_channel, sel_target_col_frmttd),
            use_container_width=True,
        )

        st.markdown("<hr>", unsafe_allow_html=True)

        if st.button("Save this session", use_container_width=True):

            project_dct_pkl = pickle.dumps(st.session_state["project_dct"])

            update_db(
                st.session_state["project_number"],
                "Current_Media_Performance",
                "project_dct",
                project_dct_pkl,
                schema,
                # resp_mtrc=None,
            )  # db

            st.success("Session Saved!")

        # Remove "response_metric_" from the start and "_total" from the end
        if str(target_col).startswith("response_metric_"):
            target_col = target_col.replace("response_metric_", "", 1)

        # Remove the last 6 characters (length of "_total")
        if str(target_col).endswith("_total"):
            target_col = target_col[:-6]

        if (
            st.session_state["project_dct"]["current_media_performance"][
                "model_outputs"
            ][target_col]
            is not None
        ):
            if (
                len(
                    st.session_state["project_dct"]["current_media_performance"][
                        "model_outputs"
                    ][target_col]["contribution_data"]
                )
                > 0
            ):
                st.download_button(
                    label="Download Contribution File",
                    data=st.session_state["project_dct"]["current_media_performance"][
                        "model_outputs"
                    ][target_col]["contribution_data"].to_csv(),
                    file_name="contributions.csv",
                    key="dwnld_contr",
                )
except:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    error_message = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    log_message("error", f"Error: {error_message}", "Current Media Performance")
    st.warning("An error occured, please try again", icon="⚠️")