Spaces:
Sleeping
Sleeping
import streamlit as st | |
import plotly.express as px | |
import numpy as np | |
import plotly.graph_objects as go | |
from sklearn.metrics import r2_score | |
from collections import OrderedDict | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import streamlit as st | |
import re | |
from matplotlib.colors import ListedColormap | |
# from st_aggrid import AgGrid, GridOptionsBuilder | |
# from src.agstyler import PINLEFT, PRECISION_TWO, draw_grid | |
def format_numbers(x):
    """
    Render a number as a compact, comma-grouped string with one decimal place.

    Parameters:
    x (int or float): The value to format.

    Returns:
    str: ``x`` scaled to millions with an ``M`` suffix when ``abs(x) >= 1e6``,
    scaled to thousands with a ``K`` suffix when ``abs(x) >= 1e3``, and
    unscaled with no suffix otherwise.
    """
    magnitude = abs(x)
    # Largest unit first; the first threshold reached decides the suffix.
    for threshold, suffix in ((1e6, 'M'), (1e3, 'K')):
        if magnitude >= threshold:
            return f'{x / threshold:,.1f}{suffix}'
    # Values below 1000 (and NaN, which fails every comparison) stay unscaled.
    return f'{x:,.1f}'
def line_plot(data, x_col, y1_cols, y2_cols, title):
    """
    Create a line plot with two sets of y-axis data.

    Parameters:
    data (DataFrame): The data containing the columns to be plotted.
    x_col (str): The column name for the x-axis.
    y1_cols (list): List of column names for the primary y-axis.
    y2_cols (list): List of column names for the secondary y-axis.
    title (str): The title of the plot. A falsy value leaves the figure untitled.

    Returns:
    fig (Figure): The Plotly figure object with the line plot.
    """
    fig = go.Figure()

    # Add traces for the primary y-axis
    for y1_col in y1_cols:
        fig.add_trace(go.Scatter(
            x=data[x_col],
            y=data[y1_col],
            mode='lines',
            name=y1_col,
            line=dict(color='#11B6BD'),
        ))

    # Add traces for the secondary y-axis
    for y2_col in y2_cols:
        fig.add_trace(go.Scatter(
            x=data[x_col],
            y=data[y2_col],
            mode='lines',
            name=y2_col,
            yaxis='y2',
            line=dict(color='#739FAE'),
        ))

    # Declare the overlaid right-hand axis only when a series is drawn on it,
    # so a figure without secondary columns carries no unused axis definition.
    # len() is used instead of truthiness so array-like inputs also work.
    if len(y2_cols) > 0:
        fig.update_layout(yaxis2=dict(overlaying='y', side='right'))

    # Add title if provided
    if title:
        fig.update_layout(title=title)

    # Customize axes and legend
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="top",
        y=1.1,
        xanchor="center",
        x=0.5,
    ))
    return fig
def line_plot_target(df, target, title):
    """
    Create a line plot with a trendline for a target column.

    Parameters:
    df (DataFrame): The data containing a datetime 'date' column and the target column.
    target (str): The column name for the y-axis.
    title (str): The title of the plot.

    Returns:
    fig (Figure): The Plotly figure object with the line plot and trendline.
    """
    dates = df['date']

    # Integer form of the timestamps, used as the regressor for the linear fit.
    # Series.astype replaces Series.view, which is deprecated since pandas 2.2.
    date_ints = dates.astype('int64')

    # Fit a first-degree polynomial (straight line) through the target values
    coefficients = np.polyfit(date_ints, df[target], 1)
    trendline = np.poly1d(coefficients)

    fig = go.Figure()

    # Add the target line plot
    fig.add_trace(go.Scatter(x=dates, y=df[target], mode='lines', name=target, line=dict(color='#11B6BD')))

    # Evaluate the fitted line at every date and add it as a second trace
    fig.add_trace(go.Scatter(x=dates, y=trendline(date_ints), mode='lines', name='Trendline', line=dict(color='#739FAE')))

    # Update layout with title and x-axis type
    fig.update_layout(
        title=title,
        xaxis=dict(type='date')
    )

    # Add vertical lines at the start of each year; the first year found in
    # the data is skipped, so only later year boundaries are marked.
    for year in dates.dt.year.unique()[1:]:
        january_1 = pd.Timestamp(year=int(year), month=1, day=1)
        fig.add_shape(
            go.layout.Shape(
                type="line",
                x0=january_1,
                x1=january_1,
                y0=0,
                y1=1,
                xref="x",
                yref="paper",  # span the full plot height regardless of the data range
                line=dict(color="grey", width=1.5, dash="dash"),
            )
        )

    # Customize the legend
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="top",
        y=1.1,
        xanchor="center",
        x=0.5
    ))
    return fig
def correlation_plot(df, selected_features, target):
    """
    Create a correlation heatmap plot for selected features and target column.

    Parameters:
    df (DataFrame): The data containing the columns to be plotted.
    selected_features (list): List of column names to be included in the correlation plot.
    target (str): The target column name to be included in the correlation plot.

    Returns:
    fig (Figure): The Matplotlib figure object with the correlation heatmap plot.
    """
    # Features first, target last; dict.fromkeys drops repeats while keeping
    # order, so a target that is also listed as a feature appears only once.
    columns = list(dict.fromkeys([*selected_features, target]))

    # Compute the correlation matrix once and reuse it for values and mask
    corr_matrix = df[columns].corr()

    # Create a matplotlib figure and axis
    fig, ax = plt.subplots(figsize=(16, 12))

    # Generate the heatmap with correlation coefficients. The mask is built
    # from np.triu of the matrix, which hides the upper-triangle cells
    # (diagonal included) whose correlation value is non-zero.
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='Blues',
        fmt=".2f",
        linewidths=0.5,
        mask=np.triu(corr_matrix),
        ax=ax,  # draw on this figure's axes rather than pyplot's current axes
    )

    # Customize the tick labels on the same axes
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.setp(ax.get_yticklabels(), rotation=0)
    return fig
def _sum_features_by_year(data, features):
    """Return the per-year sums of ``features``, indexed by the year of ``data['date']``."""
    # Work on a copy so the helper 'Year' column is never written into a
    # slice of the caller's DataFrame.
    frame = data[features].copy()
    frame['Year'] = data['date'].dt.year
    return frame.groupby('Year')[features].sum()


def _format_cells(frame):
    """Return ``frame`` with ``format_numbers`` applied to every cell."""
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated since pandas 2.1.
    return frame.apply(lambda column: column.map(format_numbers))


def _add_dollar_prefix(frame, columns):
    """Prefix every value of the given columns of ``frame`` with ``$`` and return ``frame``."""
    for col in columns:
        frame[col] = frame[col].map(lambda x: f'${x}')
    return frame


def _currency_columns(columns):
    """Return the column names that contain 'spends' or 'cost'."""
    return [col for col in columns if any(keyword in col for keyword in ['spends', 'cost'])]


def summary(data, selected_feature, spends, Target=None):
    """
    Create a summary table of selected features and optionally a target column.

    With a truthy ``Target`` the selected features are summed per year, a
    'Total' row is appended, and columns whose name contains 'spends' or
    'cost' are shown as currency.

    Without ``Target`` the ``spends`` column is added to the selected features
    (duplicates dropped, order kept) and summed per year with a 'Grand Total'
    row. When at least one feature besides ``spends`` is present, a 'CPM/CPC'
    column is added as ``spends / feature * 1000`` using the first such
    feature; its 'Grand Total' is computed from the total spends and total
    feature value. Zero and undefined values are shown as '-'.

    Parameters:
    data (DataFrame): The data containing a datetime 'date' column and the columns to be summarized.
    selected_feature (list): List of column names to be included in the summary. It is not modified.
    spends (str): The column name for the spends data.
    Target (str, optional): When truthy, selects the target-style summary. Default is None.

    Returns:
    sum_df (DataFrame): The summary DataFrame with formatted (string) values, indexed by year.
    """
    if Target:
        yearly = _sum_features_by_year(data, list(selected_feature))

        # Append the total over all years
        yearly.loc['Total'] = yearly.sum(numeric_only=True)

        sum_df = _format_cells(yearly)
        return _add_dollar_prefix(sum_df, _currency_columns(sum_df.columns))

    # Build a new, order-preserving, de-duplicated list so the caller's list
    # is left untouched and the column roles below are deterministic.
    features = list(dict.fromkeys([*selected_feature, spends]))

    yearly = _sum_features_by_year(data, features)
    totals = yearly.sum()

    if len(features) > 1:
        # The impressions/clicks column is the first feature that is not spends
        imp_clicks = next(col for col in features if col != spends)

        # Calculate CPM/CPC per year, and for the grand total from the summed
        # columns (summing the yearly ratios would not give a meaningful rate).
        yearly['CPM/CPC'] = (yearly[spends] / yearly[imp_clicks]) * 1000
        totals['CPM/CPC'] = (totals[spends] / totals[imp_clicks]) * 1000
        currency_cols = [spends]
    else:
        currency_cols = _currency_columns(yearly.columns)

    yearly.loc['Grand Total'] = totals

    if 'CPM/CPC' in yearly.columns:
        # A zero denominator yields +/-inf; treat it as undefined so it is
        # rendered as '-' like other missing values.
        yearly['CPM/CPC'] = yearly['CPM/CPC'].replace([np.inf, -np.inf], np.nan)

    # Format numbers, then blank out zeros and undefined values
    sum_df = _format_cells(yearly)
    sum_df = sum_df.replace({"0.0": '-', 'nan': '-'})

    # Format spends columns as currency
    return _add_dollar_prefix(sum_df, currency_cols)
def sanitize_key(key, prefix=""):
    """
    Build a key containing only ASCII letters and digits.

    Parameters:
    key (str): The raw text to clean.
    prefix (str, optional): Text placed in front of the cleaned key. Default is "".

    Returns:
    str: ``prefix`` followed by ``key`` with every character outside
    a-z, A-Z and 0-9 removed.
    """
    # Strip everything that is not an ASCII letter or digit (spaces included)
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', key)
    return "{}{}".format(prefix, cleaned)