import streamlit as st

st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import preprocessor, helper
from sentiment import predict_sentiment_batch

import os

# Keep Streamlit from automatically rerunning the script when the source changes
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
# Theme customization
st.markdown(
    """
    <style>
    .main {background-color: #f0f2f6;}
    </style>
    """,
    unsafe_allow_html=True,
)

# Set seaborn style
sns.set_theme(style="whitegrid")
st.title("WhatsApp Chat Sentiment Analysis Dashboard")

st.subheader("Instructions")
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

st.sidebar.title("WhatsApp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")
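
# Thin wrapper around preprocessor.preprocess, which is assumed to return the
# parsed chat dataframe plus a second value that this dashboard does not use.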
def load_and_preprocess(file_content):
    return preprocessor.preprocess(file_content)


if uploaded_file is not None:
    raw_data = uploaded_file.read().decode("utf-8")

    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
        st.session_state.df = df
    st.sidebar.header("Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)
    # Work on a copy so later column assignments (e.g. 'sentiment') do not
    # raise pandas' SettingWithCopyWarning on a slice of df.
    df_filtered = df.copy() if selected_user == "Overall" else df[df["user"] == selected_user].copy()

    if st.sidebar.button("Show Analysis"):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
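                # Label each message with the batch sentiment model and store
                # the labelled dataframe in session state.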
                if 'sentiment' not in df_filtered.columns:
                    try:
                        print("Starting sentiment analysis...")
                        # Classify only non-empty messages so the predicted labels
                        # align row-for-row with the dataframe when assigned back
                        messages = df_filtered["message"].astype(str)
                        non_empty = messages.str.strip() != ""
                        print(f"Processing {int(non_empty.sum())} messages")
                        print(f"Sample messages: {messages[non_empty].head().tolist()}")
                        # Directly call the sentiment analysis function
                        df_filtered.loc[non_empty, 'sentiment'] = predict_sentiment_batch(
                            messages[non_empty].tolist()
                        )
                        print("Sentiment analysis completed successfully")
                    except Exception as e:
                        st.error(f"Sentiment analysis failed: {str(e)}")
                        print(f"Full error: {str(e)}")
                    st.session_state.df_filtered = df_filtered
                else:
                    st.session_state.df_filtered = df_filtered
            # Display statistics and visualizations
            num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)

            st.title("Top Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.header("Total Messages")
                st.title(num_messages)
            with col2:
                st.header("Total Words")
                st.title(words)
            with col3:
                st.header("Media Shared")
                st.title(num_media)
            with col4:
                st.header("Links Shared")
                st.title(num_links)
st.title("Monthly Timeline") | |
timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered)))) | |
if not timeline.empty: | |
plt.figure(figsize=(10, 5)) | |
sns.lineplot(data=timeline, x='time', y='message', color='green') | |
plt.title("Monthly Timeline") | |
plt.xlabel("Date") | |
plt.ylabel("Messages") | |
st.pyplot(plt) | |
plt.clf() | |
st.title("Daily Timeline") | |
daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered)))) | |
if not daily_timeline.empty: | |
plt.figure(figsize=(10, 5)) | |
sns.lineplot(data=daily_timeline, x='date', y='message', color='black') | |
plt.title("Daily Timeline") | |
plt.xlabel("Date") | |
plt.ylabel("Messages") | |
st.pyplot(plt) | |
plt.clf() | |
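
            # Activity maps: busiest weekday and busiest month, shown side by side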
st.title("Activity Map") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.header("Most Busy Day") | |
busy_day = helper.week_activity_map(selected_user, df_filtered) | |
if not busy_day.empty: | |
plt.figure(figsize=(10, 5)) | |
sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r") | |
plt.title("Most Busy Day") | |
plt.xlabel("Day of Week") | |
plt.ylabel("Message Count") | |
st.pyplot(plt) | |
plt.clf() | |
with col2: | |
st.header("Most Busy Month") | |
busy_month = helper.month_activity_map(selected_user, df_filtered) | |
if not busy_month.empty: | |
plt.figure(figsize=(10, 5)) | |
sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r") | |
plt.title("Most Busy Month") | |
plt.xlabel("Month") | |
plt.ylabel("Message Count") | |
st.pyplot(plt) | |
plt.clf() | |
            if selected_user == 'Overall':
                st.title("Most Busy Users")
                x, new_df = helper.most_busy_users(df_filtered)
                if not x.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=x.index, y=x.values, palette="Reds_r")
                    plt.title("Most Busy Users")
                    plt.xlabel("User")
                    plt.ylabel("Message Count")
                    plt.xticks(rotation=45)
                    st.pyplot(plt)
                    plt.clf()

                    st.title("Word Count by User")
                    st.dataframe(new_df)
            # Most common words analysis
            st.title("Most Common Words")
            most_common_df = helper.most_common_words(selected_user, df_filtered)
            if not most_common_df.empty:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                ax.set_title("Top 20 Most Common Words")
                ax.set_xlabel("Frequency")
                ax.set_ylabel("Words")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
                plt.clf()
            else:
                st.warning("No data available for most common words.")
            # Emoji analysis
            st.title("Emoji Analysis")
            emoji_df = helper.emoji_helper(selected_user, df_filtered)
            if not emoji_df.empty:
                col1, col2 = st.columns(2)
                with col1:
                    st.subheader("Top Emojis Used")
                    st.dataframe(emoji_df)
                with col2:
                    fig, ax = plt.subplots(figsize=(8, 8))
                    ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                           autopct="%0.2f%%", startangle=90,
                           colors=sns.color_palette("pastel"))
                    ax.set_title("Top Emoji Distribution")
                    st.pyplot(fig)
                    plt.clf()
            else:
                st.warning("No data available for emoji analysis.")
            # Sentiment Analysis Visualizations
            st.title("Sentiment Analysis")

            # Convert month names to abbreviated format; unmapped values pass through unchanged
            month_map = {
                'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
                'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
                'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
            }
            df_filtered['month'] = df_filtered['month'].replace(month_map)

            # Group by month and sentiment, keeping months in calendar order
            monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
            month_order = [m for m in month_map.values() if m in monthly_sentiment.index]
            monthly_sentiment = monthly_sentiment.reindex(month_order)

            # Plotting: Histogram (Bar Chart) for each sentiment
            st.write("### Sentiment Count by Month (Histogram)")

            # Create a figure with subplots for each sentiment
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))

            # Plot Positive Sentiment
            if 'positive' in monthly_sentiment:
                axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
                axes[0].set_title('Positive Sentiment')
                axes[0].set_xlabel('Month')
                axes[0].set_ylabel('Count')

            # Plot Neutral Sentiment
            if 'neutral' in monthly_sentiment:
                axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
                axes[1].set_title('Neutral Sentiment')
                axes[1].set_xlabel('Month')
                axes[1].set_ylabel('Count')

            # Plot Negative Sentiment
            if 'negative' in monthly_sentiment:
                axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
                axes[2].set_title('Negative Sentiment')
                axes[2].set_xlabel('Month')
                axes[2].set_ylabel('Count')

            # Display the plots in Streamlit
            st.pyplot(fig)
            plt.clf()
            # Count sentiments per day of the week
            sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)

            # Sort days correctly
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sentiment_counts = sentiment_counts.reindex(day_order)

            # Daily Sentiment Analysis
            st.write("### Daily Sentiment Analysis")

            # Create a Matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])

            # Customize the plot
            ax.set_xlabel("Day of the Week")
            ax.set_ylabel("Count")
            ax.set_title("Sentiment Distribution per Day of the Week")
            ax.legend(title="Sentiment")

            # Display the plot in Streamlit
            st.pyplot(fig)
            plt.clf()
            # Count messages per user per sentiment (only for Overall view)
            if selected_user == 'Overall':
                sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                # Calculate total messages per sentiment
                total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()

                # Add percentage column
                sentiment_counts['Percentage'] = sentiment_counts.apply(
                    lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
                )

                # Separate tables for each sentiment
                positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
                neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
                negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)

                # Sentiment Contribution Analysis
                st.write("### Sentiment Contribution by User")

                # Create three columns for side-by-side display
                col1, col2, col3 = st.columns(3)

                # Display Positive Table
                with col1:
                    st.subheader("Top Positive Contributors")
                    if not positive_df.empty:
                        st.dataframe(positive_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No positive sentiment data")

                # Display Neutral Table
                with col2:
                    st.subheader("Top Neutral Contributors")
                    if not neutral_df.empty:
                        st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No neutral sentiment data")

                # Display Negative Table
                with col3:
                    st.subheader("Top Negative Contributors")
                    if not negative_df.empty:
                        st.dataframe(negative_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No negative sentiment data")
            # Topic Analysis Section
            st.title("Area of Focus: Topic Analysis")

            # Check if topic column exists, otherwise perform topic modeling
            # if 'topic' not in df_filtered.columns:
            #     with st.spinner("Performing topic modeling..."):
            #         try:
            #             # Add topic modeling here or ensure your helper functions handle it
            #             df_filtered = helper.perform_topic_modeling(df_filtered)
            #         except Exception as e:
            #             st.error(f"Topic modeling failed: {str(e)}")
            #             st.stop()

            # Plot Topic Distribution
            st.header("Topic Distribution")
            try:
                fig = helper.plot_topic_distribution(df_filtered)
                st.pyplot(fig)
                plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic distribution: {str(e)}")
            # Display Sample Messages for Each Topic
            st.header("Sample Messages for Each Topic")
            if 'topic' in df_filtered.columns:
                for topic_id in sorted(df_filtered['topic'].unique()):
                    st.subheader(f"Topic {topic_id}")

                    # Get messages for the current topic
                    filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']

                    # Determine sample size
                    sample_size = min(5, len(filtered_messages))
                    if sample_size > 0:
                        sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                        for msg in sample_messages:
                            st.write(f"- {msg}")
                    else:
                        st.write("No messages available for this topic.")
            else:
                st.warning("Topic information not available")
            # Topic Distribution Over Time
            st.header("Topic Trends Over Time")

            # Add time frequency selector
            time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

            # Plot topic trends
            try:
                freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                # Choose between static and interactive plot
                use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')
                if use_plotly:
                    fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    fig = helper.plot_topic_distribution_over_time(topic_distribution)
                    st.pyplot(fig)
                    plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic trends: {str(e)}")
            # Clustering Analysis Section
            st.title("Conversation Clusters")

            # Number of clusters input
            n_clusters = st.slider("Select number of clusters",
                                   min_value=2,
                                   max_value=10,
                                   value=5,
                                   key='n_clusters')

            # Perform clustering
            with st.spinner("Analyzing conversation clusters..."):
                try:
                    df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                    # Plot clusters
                    st.header("Cluster Visualization")
                    fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                    st.pyplot(fig)
                    plt.clf()
                    # Cluster Insights
                    st.header("Cluster Insights")

                    # 1. Dominant Conversation Themes
                    st.subheader("1. Dominant Themes")
                    cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                    for cluster_id, label in cluster_labels.items():
                        st.write(f"**Cluster {cluster_id}**: {label}")

                    # 2. Temporal Patterns
                    st.subheader("2. Temporal Patterns")
                    temporal_trends = helper.get_temporal_trends(df_clustered)
                    for cluster_id, trend in temporal_trends.items():
                        st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                    # 3. User Contributions
                    if selected_user == 'Overall':
                        st.subheader("3. Top Contributors")
                        user_contributions = helper.get_user_contributions(df_clustered)
                        for cluster_id, users in user_contributions.items():
                            st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                    # 4. Sentiment by Cluster
                    st.subheader("4. Sentiment Analysis")
                    sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                    for cluster_id, sentiment in sentiment_by_cluster.items():
                        st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")
                    # Sample messages from each cluster
                    st.subheader("Sample Messages")
                    for cluster_id in sorted(df_clustered['cluster'].unique()):
                        with st.expander(f"Cluster {cluster_id} Messages"):
                            cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                            sample_size = min(3, len(cluster_msgs))
                            if sample_size > 0:
                                for msg in cluster_msgs.sample(sample_size, replace=False):
                                    st.write(f"- {msg}")
                            else:
                                st.write("No messages available")
                except Exception as e:
                    st.error(f"Clustering failed: {str(e)}")