|
import streamlit as st |
|
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide") |
|
|
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import preprocessor, helper |
|
from sentiment import predict_sentiment_batch |
|
import os |
|
# --- Static page chrome: config flag, CSS, title, instructions, sidebar ---
# Disable Streamlit's rerun-on-save for this server process.
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"

# Light background for the main pane.
st.markdown(
    """
<style>
.main {background-color: #f0f2f6;}
</style>
""",
    unsafe_allow_html=True,
)

sns.set_theme(style="whitegrid")

# NOTE(review): the leading "π" looks like a mojibake'd emoji — confirm the
# file's original encoding before "fixing" the literal.
st.title("π WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
for instruction in (
    "1. Open the sidebar and upload your WhatsApp chat file in .txt format.",
    "2. Wait for the initial processing (minimal delay).",
    "3. Customize the analysis by selecting users or filters.",
    "4. Click 'Show Analysis' for detailed results.",
):
    st.markdown(instruction)

st.sidebar.title("Whatsapp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")
|
|
|
@st.cache_data
def load_and_preprocess(file_content):
    """Parse the decoded chat-export text via ``preprocessor.preprocess``.

    Wrapped in ``st.cache_data`` so that Streamlit's rerun-on-every-widget
    model does not re-parse the same upload repeatedly; the cache key is the
    raw text itself.

    Returns whatever ``preprocessor.preprocess`` returns — per the call site
    this is a 2-tuple whose first element is the messages DataFrame (the
    second element is discarded there).
    """
    return preprocessor.preprocess(file_content)
|
|
|
if uploaded_file is not None:
    # WhatsApp .txt exports are UTF-8; decode once and hand to the cached parser.
    raw_data = uploaded_file.read().decode("utf-8")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("π Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    # FIX: take explicit copies. The original kept a view (the boolean slice)
    # or an alias of `df` itself, and later stages mutate df_filtered in
    # place (adding a 'sentiment' column, remapping 'month') — that triggers
    # pandas SettingWithCopyWarning on the view and silently rewrites the
    # shared `df` in the alias case.
    if selected_user == "Overall":
        df_filtered = df.copy()
    else:
        df_filtered = df[df["user"] == selected_user].copy()
|
|
|
if st.sidebar.button("Show Analysis"): |
|
if df_filtered.empty: |
|
st.warning(f"No data found for user: {selected_user}") |
|
else: |
|
with st.spinner("Analyzing..."): |
|
if 'sentiment' not in df_filtered.columns: |
|
try: |
|
print("Starting sentiment analysis...") |
|
|
|
message_list = df_filtered["message"].astype(str).tolist() |
|
message_list = [msg for msg in message_list if msg.strip()] |
|
|
|
print(f"Processing {len(message_list)} messages") |
|
print(f"Sample messages: {message_list[:5]}") |
|
|
|
|
|
df_filtered['sentiment'] = predict_sentiment_batch(message_list) |
|
print("Sentiment analysis completed successfully") |
|
|
|
except Exception as e: |
|
st.error(f"Sentiment analysis failed: {str(e)}") |
|
print(f"Full error: {str(e)}") |
|
|
|
st.session_state.df_filtered = df_filtered |
|
else: |
|
st.session_state.df_filtered = df_filtered |
|
|
|
|
|
num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered) |
|
st.title("Top Statistics") |
|
col1, col2, col3, col4 = st.columns(4) |
|
with col1: |
|
st.header("Total Messages") |
|
st.title(num_messages) |
|
with col2: |
|
st.header("Total Words") |
|
st.title(words) |
|
with col3: |
|
st.header("Media Shared") |
|
st.title(num_media) |
|
with col4: |
|
st.header("Links Shared") |
|
st.title(num_links) |
|
|
|
st.title("Monthly Timeline") |
|
timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered)))) |
|
if not timeline.empty: |
|
plt.figure(figsize=(10, 5)) |
|
sns.lineplot(data=timeline, x='time', y='message', color='green') |
|
plt.title("Monthly Timeline") |
|
plt.xlabel("Date") |
|
plt.ylabel("Messages") |
|
st.pyplot(plt) |
|
plt.clf() |
|
|
|
st.title("Daily Timeline") |
|
daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered)))) |
|
if not daily_timeline.empty: |
|
plt.figure(figsize=(10, 5)) |
|
sns.lineplot(data=daily_timeline, x='date', y='message', color='black') |
|
plt.title("Daily Timeline") |
|
plt.xlabel("Date") |
|
plt.ylabel("Messages") |
|
st.pyplot(plt) |
|
plt.clf() |
|
|
|
st.title("Activity Map") |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
st.header("Most Busy Day") |
|
busy_day = helper.week_activity_map(selected_user, df_filtered) |
|
if not busy_day.empty: |
|
plt.figure(figsize=(10, 5)) |
|
sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r") |
|
plt.title("Most Busy Day") |
|
plt.xlabel("Day of Week") |
|
plt.ylabel("Message Count") |
|
st.pyplot(plt) |
|
plt.clf() |
|
with col2: |
|
st.header("Most Busy Month") |
|
busy_month = helper.month_activity_map(selected_user, df_filtered) |
|
if not busy_month.empty: |
|
plt.figure(figsize=(10, 5)) |
|
sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r") |
|
plt.title("Most Busy Month") |
|
plt.xlabel("Month") |
|
plt.ylabel("Message Count") |
|
st.pyplot(plt) |
|
plt.clf() |
|
|
|
if selected_user == 'Overall': |
|
st.title("Most Busy Users") |
|
x, new_df = helper.most_busy_users(df_filtered) |
|
if not x.empty: |
|
plt.figure(figsize=(10, 5)) |
|
sns.barplot(x=x.index, y=x.values, palette="Reds_r") |
|
plt.title("Most Busy Users") |
|
plt.xlabel("User") |
|
plt.ylabel("Message Count") |
|
plt.xticks(rotation=45) |
|
st.pyplot(plt) |
|
st.title("Word Count by User") |
|
plt.clf() |
|
st.dataframe(new_df) |
|
|
|
|
|
st.title("Most Common Words") |
|
most_common_df = helper.most_common_words(selected_user, df_filtered) |
|
if not most_common_df.empty: |
|
fig, ax = plt.subplots(figsize=(10, 6)) |
|
sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r") |
|
ax.set_title("Top 20 Most Common Words") |
|
ax.set_xlabel("Frequency") |
|
ax.set_ylabel("Words") |
|
plt.xticks(rotation='vertical') |
|
st.pyplot(fig) |
|
plt.clf() |
|
else: |
|
st.warning("No data available for most common words.") |
|
|
|
|
|
st.title("Emoji Analysis") |
|
emoji_df = helper.emoji_helper(selected_user, df_filtered) |
|
if not emoji_df.empty: |
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
st.subheader("Top Emojis Used") |
|
st.dataframe(emoji_df) |
|
|
|
with col2: |
|
fig, ax = plt.subplots(figsize=(8, 8)) |
|
ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(), |
|
autopct="%0.2f%%", startangle=90, |
|
colors=sns.color_palette("pastel")) |
|
ax.set_title("Top Emoji Distribution") |
|
st.pyplot(fig) |
|
plt.clf() |
|
else: |
|
st.warning("No data available for emoji analysis.") |
|
|
|
|
|
st.title("π Sentiment Analysis") |
|
|
|
|
|
month_map = { |
|
'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr', |
|
'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug', |
|
'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec' |
|
} |
|
df_filtered['month'] = df_filtered['month'].map(month_map) |
|
|
|
|
|
monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0) |
|
|
|
|
|
st.write("### Sentiment Count by Month (Histogram)") |
|
|
|
|
|
fig, axes = plt.subplots(1, 3, figsize=(18, 5)) |
|
|
|
|
|
if 'positive' in monthly_sentiment: |
|
axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green') |
|
axes[0].set_title('Positive Sentiment') |
|
axes[0].set_xlabel('Month') |
|
axes[0].set_ylabel('Count') |
|
|
|
|
|
if 'neutral' in monthly_sentiment: |
|
axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue') |
|
axes[1].set_title('Neutral Sentiment') |
|
axes[1].set_xlabel('Month') |
|
axes[1].set_ylabel('Count') |
|
|
|
|
|
if 'negative' in monthly_sentiment: |
|
axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red') |
|
axes[2].set_title('Negative Sentiment') |
|
axes[2].set_xlabel('Month') |
|
axes[2].set_ylabel('Count') |
|
|
|
|
|
st.pyplot(fig) |
|
plt.clf() |
|
|
|
|
|
sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0) |
|
|
|
|
|
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] |
|
sentiment_counts = sentiment_counts.reindex(day_order) |
|
|
|
|
|
st.write("### Daily Sentiment Analysis") |
|
|
|
|
|
fig, ax = plt.subplots(figsize=(10, 5)) |
|
sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green']) |
|
|
|
|
|
ax.set_xlabel("Day of the Week") |
|
ax.set_ylabel("Count") |
|
ax.set_title("Sentiment Distribution per Day of the Week") |
|
ax.legend(title="Sentiment") |
|
|
|
|
|
st.pyplot(fig) |
|
plt.clf() |
|
|
|
|
|
if selected_user == 'Overall': |
|
sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count') |
|
|
|
|
|
total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict() |
|
|
|
|
|
sentiment_counts['Percentage'] = sentiment_counts.apply( |
|
lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1 |
|
) |
|
|
|
|
|
positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10) |
|
neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10) |
|
negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10) |
|
|
|
|
|
st.write("### Sentiment Contribution by User") |
|
|
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
|
|
with col1: |
|
st.subheader("Top Positive Contributors") |
|
if not positive_df.empty: |
|
st.dataframe(positive_df[['user', 'Count', 'Percentage']]) |
|
else: |
|
st.warning("No positive sentiment data") |
|
|
|
|
|
with col2: |
|
st.subheader("Top Neutral Contributors") |
|
if not neutral_df.empty: |
|
st.dataframe(neutral_df[['user', 'Count', 'Percentage']]) |
|
else: |
|
st.warning("No neutral sentiment data") |
|
|
|
|
|
with col3: |
|
st.subheader("Top Negative Contributors") |
|
if not negative_df.empty: |
|
st.dataframe(negative_df[['user', 'Count', 'Percentage']]) |
|
else: |
|
st.warning("No negative sentiment data") |
|
|
|
|
|
st.title("π Area of Focus: Topic Analysis") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.header("Topic Distribution") |
|
try: |
|
fig = helper.plot_topic_distribution(df_filtered) |
|
st.pyplot(fig) |
|
plt.clf() |
|
except Exception as e: |
|
st.warning(f"Could not display topic distribution: {str(e)}") |
|
|
|
|
|
st.header("Sample Messages for Each Topic") |
|
if 'topic' in df_filtered.columns: |
|
for topic_id in sorted(df_filtered['topic'].unique()): |
|
st.subheader(f"Topic {topic_id}") |
|
|
|
|
|
filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message'] |
|
|
|
|
|
sample_size = min(5, len(filtered_messages)) |
|
|
|
if sample_size > 0: |
|
sample_messages = filtered_messages.sample(sample_size, replace=False).tolist() |
|
for msg in sample_messages: |
|
st.write(f"- {msg}") |
|
else: |
|
st.write("No messages available for this topic.") |
|
else: |
|
st.warning("Topic information not available") |
|
|
|
|
|
st.header("π
Topic Trends Over Time") |
|
|
|
|
|
time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq') |
|
|
|
|
|
try: |
|
freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"} |
|
topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq]) |
|
|
|
|
|
use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly') |
|
|
|
if use_plotly: |
|
fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution) |
|
st.plotly_chart(fig, use_container_width=True) |
|
else: |
|
fig = helper.plot_topic_distribution_over_time(topic_distribution) |
|
st.pyplot(fig) |
|
plt.clf() |
|
except Exception as e: |
|
st.warning(f"Could not display topic trends: {str(e)}") |
|
|
|
|
|
st.title("π§© Conversation Clusters") |
|
|
|
|
|
n_clusters = st.slider("Select number of clusters", |
|
min_value=2, |
|
max_value=10, |
|
value=5, |
|
key='n_clusters') |
|
|
|
|
|
with st.spinner("Analyzing conversation clusters..."): |
|
try: |
|
df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters) |
|
|
|
|
|
st.header("Cluster Visualization") |
|
fig = helper.plot_clusters(reduced_features, df_clustered['cluster']) |
|
st.pyplot(fig) |
|
plt.clf() |
|
|
|
|
|
st.header("π Cluster Insights") |
|
|
|
|
|
st.subheader("1. Dominant Themes") |
|
cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters) |
|
for cluster_id, label in cluster_labels.items(): |
|
st.write(f"**Cluster {cluster_id}**: {label}") |
|
|
|
|
|
st.subheader("2. Temporal Patterns") |
|
temporal_trends = helper.get_temporal_trends(df_clustered) |
|
for cluster_id, trend in temporal_trends.items(): |
|
st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}") |
|
|
|
|
|
if selected_user == 'Overall': |
|
st.subheader("3. Top Contributors") |
|
user_contributions = helper.get_user_contributions(df_clustered) |
|
for cluster_id, users in user_contributions.items(): |
|
st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...") |
|
|
|
|
|
st.subheader("4. Sentiment Analysis") |
|
sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered) |
|
for cluster_id, sentiment in sentiment_by_cluster.items(): |
|
st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative") |
|
|
|
|
|
st.subheader("Sample Messages") |
|
for cluster_id in sorted(df_clustered['cluster'].unique()): |
|
with st.expander(f"Cluster {cluster_id} Messages"): |
|
cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message'] |
|
sample_size = min(3, len(cluster_msgs)) |
|
if sample_size > 0: |
|
for msg in cluster_msgs.sample(sample_size, replace=False): |
|
st.write(f"- {msg}") |
|
else: |
|
st.write("No messages available") |
|
|
|
except Exception as e: |
|
st.error(f"Clustering failed: {str(e)}") |