File size: 21,388 Bytes
d5ba1b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
import streamlit as st
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor, helper
from sentiment import predict_sentiment_batch
import os
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"

# Theme customization: give the main content area a light grey background.
st.markdown(
    """
    <style>
    .main {background-color: #f0f2f6;}
    </style>
    """,
    unsafe_allow_html=True
)

# Use seaborn's white-grid look for every chart drawn below.
sns.set_theme(style="whitegrid")

# Page header and usage instructions. The emoji was stored as mojibake
# (UTF-8 bytes decoded with a single-byte code page); it is restored here.
st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

# Sidebar: product name (spelled like the page title) and the chat export uploader.
st.sidebar.title("WhatsApp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

@st.cache_data
def load_and_preprocess(file_content):
    """Parse the uploaded chat text, memoised through ``st.cache_data``.

    Args:
        file_content: The decoded text of the uploaded chat export.

    Returns:
        Whatever ``preprocessor.preprocess`` returns for that text; the
        caller in this script unpacks it into two values.
    """
    preprocessed = preprocessor.preprocess(file_content)
    return preprocessed

import logging

logger = logging.getLogger(__name__)

# Full month name -> three-letter abbreviation used on the sentiment charts.
MONTH_ABBREVIATIONS = {
    'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
    'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
    'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
}
# Calendar order of the abbreviations (dict insertion order is chronological).
MONTH_ORDER = list(MONTH_ABBREVIATIONS.values())
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Colour per sentiment label, looked up by name so a missing label cannot shift colours.
SENTIMENT_COLORS = {'negative': 'red', 'neutral': 'blue', 'positive': 'green'}


@st.cache_data(show_spinner=False)
def _predict_sentiments(messages):
    """Return one sentiment prediction per message, cached by message content.

    The analysis is re-rendered on every widget interaction, so the
    (potentially slow) model call is memoised.

    Args:
        messages: Tuple of message strings.

    Returns:
        A list built from the output of ``predict_sentiment_batch``.
    """
    return list(predict_sentiment_batch(list(messages)))


def _show_figure(fig):
    """Render a Matplotlib figure in Streamlit and release it afterwards."""
    st.pyplot(fig)
    if isinstance(fig, plt.Figure):
        # Closing (not just clearing) stops figures piling up across reruns.
        plt.close(fig)
    else:
        # A helper handed back something else (e.g. the pyplot module); fall
        # back to clearing the current figure as the script did originally.
        plt.clf()


def _bar_chart(series, title, xlabel, palette, rotate_labels=False):
    """Draw a Series (index -> message count) as a bar chart and display it."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=series.index, y=series.values, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Message Count")
    if rotate_labels:
        ax.tick_params(axis='x', rotation=45)
    _show_figure(fig)


if uploaded_file is None:
    # Nothing to analyse; a later upload must ask for the analysis again.
    st.session_state["show_analysis"] = False
else:
    # getvalue() returns the whole upload regardless of the stream position;
    # utf-8-sig strips a leading BOM and undecodable bytes are replaced
    # instead of aborting the page.
    raw_data = uploaded_file.getvalue().decode("utf-8-sig", errors="replace")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    # Work on a copy: columns are added/rewritten below and must not leak into
    # the frame kept in session state (nor trigger SettingWithCopy on a slice).
    df_filtered = (df if selected_user == "Overall" else df[df["user"] == selected_user]).copy()

    # Buttons are True only for the rerun caused by the click. The analysis
    # contains its own widgets, so remember the request in session state;
    # otherwise touching any of those widgets would blank the whole page.
    if st.sidebar.button("Show Analysis"):
        st.session_state["show_analysis"] = True

    if st.session_state.get("show_analysis", False):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
                if 'sentiment' not in df_filtered.columns:
                    try:
                        messages = df_filtered["message"].astype(str)
                        # Only non-blank messages are sent to the model.
                        has_text = messages.str.strip().astype(bool).tolist()
                        to_score = tuple(
                            msg for msg, keep in zip(messages, has_text) if keep
                        )
                        # Message text is deliberately not logged (private chat content).
                        logger.info("Running sentiment analysis on %d messages", len(to_score))

                        predictions = _predict_sentiments(to_score) if to_score else []
                        if len(predictions) != len(to_score):
                            raise ValueError(
                                f"expected {len(to_score)} predictions, got {len(predictions)}"
                            )

                        # Re-align predictions with the rows they came from.
                        # Blank rows get 'neutral' (assumed to be the model's
                        # neutral label -- TODO confirm against sentiment.py).
                        remaining = iter(predictions)
                        df_filtered['sentiment'] = [
                            next(remaining) if keep else "neutral" for keep in has_text
                        ]
                        logger.info("Sentiment analysis completed successfully")

                    except Exception as e:
                        # The model can fail in arbitrary ways; report it and
                        # carry on with the sections that do not need sentiment.
                        logger.exception("Sentiment analysis failed")
                        st.error(f"Sentiment analysis failed: {str(e)}")

                st.session_state.df_filtered = df_filtered

                # Display statistics and visualizations
                num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
                st.title("Top Statistics")
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.header("Total Messages")
                    st.title(num_messages)
                with col2:
                    st.header("Total Words")
                    st.title(words)
                with col3:
                    st.header("Media Shared")
                    st.title(num_media)
                with col4:
                    st.header("Links Shared")
                    st.title(num_links)

                # Timelines use every message: counting a random sample would
                # under-report large chats and change on each rerun.
                st.title("Monthly Timeline")
                timeline = helper.monthly_timeline(selected_user, df_filtered)
                if not timeline.empty:
                    fig, ax = plt.subplots(figsize=(10, 5))
                    sns.lineplot(data=timeline, x='time', y='message', color='green', ax=ax)
                    ax.set_title("Monthly Timeline")
                    ax.set_xlabel("Date")
                    ax.set_ylabel("Messages")
                    _show_figure(fig)

                st.title("Daily Timeline")
                daily_timeline = helper.daily_timeline(selected_user, df_filtered)
                if not daily_timeline.empty:
                    fig, ax = plt.subplots(figsize=(10, 5))
                    sns.lineplot(data=daily_timeline, x='date', y='message', color='black', ax=ax)
                    ax.set_title("Daily Timeline")
                    ax.set_xlabel("Date")
                    ax.set_ylabel("Messages")
                    _show_figure(fig)

                st.title("Activity Map")
                col1, col2 = st.columns(2)
                with col1:
                    st.header("Most Busy Day")
                    busy_day = helper.week_activity_map(selected_user, df_filtered)
                    if not busy_day.empty:
                        _bar_chart(busy_day, "Most Busy Day", "Day of Week", "Purples_r")
                with col2:
                    st.header("Most Busy Month")
                    busy_month = helper.month_activity_map(selected_user, df_filtered)
                    if not busy_month.empty:
                        _bar_chart(busy_month, "Most Busy Month", "Month", "Oranges_r")

                if selected_user == 'Overall':
                    st.title("Most Busy Users")
                    x, new_df = helper.most_busy_users(df_filtered)
                    if not x.empty:
                        _bar_chart(x, "Most Busy Users", "User", "Reds_r", rotate_labels=True)
                        st.title("Word Count by User")
                        st.dataframe(new_df)

                # Most common words analysis
                st.title("Most Common Words")
                most_common_df = helper.most_common_words(selected_user, df_filtered)
                if not most_common_df.empty:
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                    ax.set_title("Top 20 Most Common Words")
                    ax.set_xlabel("Frequency")
                    ax.set_ylabel("Words")
                    ax.tick_params(axis='x', rotation=90)
                    _show_figure(fig)
                else:
                    st.warning("No data available for most common words.")

                # Emoji analysis
                st.title("Emoji Analysis")
                emoji_df = helper.emoji_helper(selected_user, df_filtered)
                if not emoji_df.empty:
                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("Top Emojis Used")
                        st.dataframe(emoji_df)

                    with col2:
                        fig, ax = plt.subplots(figsize=(8, 8))
                        ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                               autopct="%0.2f%%", startangle=90,
                               colors=sns.color_palette("pastel"))
                        ax.set_title("Top Emoji Distribution")
                        _show_figure(fig)
                else:
                    st.warning("No data available for emoji analysis.")

                # Sentiment Analysis Visualizations
                st.title("📈 Sentiment Analysis")

                # Abbreviate full month names; values that are not full month
                # names are kept as they are instead of becoming NaN.
                df_filtered['month'] = df_filtered['month'].map(
                    lambda name: MONTH_ABBREVIATIONS.get(name, name)
                )

                if 'sentiment' not in df_filtered.columns:
                    st.warning("Sentiment data is unavailable, so the sentiment charts are skipped.")
                else:
                    # Group by month and sentiment
                    monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)

                    # groupby sorts alphabetically; put the months back in
                    # calendar order, with any unrecognised values at the end.
                    present = list(monthly_sentiment.index)
                    calendar_sorted = [m for m in MONTH_ORDER if m in present]
                    calendar_sorted += [m for m in present if m not in MONTH_ORDER]
                    monthly_sentiment = monthly_sentiment.reindex(calendar_sorted)

                    # One bar chart per sentiment, side by side.
                    st.write("### Sentiment Count by Month (Histogram)")
                    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
                    for ax, label in zip(axes, ['positive', 'neutral', 'negative']):
                        if label in monthly_sentiment:
                            ax.bar(monthly_sentiment.index, monthly_sentiment[label],
                                   color=SENTIMENT_COLORS[label])
                        ax.set_title(f'{label.capitalize()} Sentiment')
                        ax.set_xlabel('Month')
                        ax.set_ylabel('Count')
                    _show_figure(fig)

                    # Count sentiments per day of the week, Monday first;
                    # days without messages show as zero rather than NaN.
                    sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
                    sentiment_counts = sentiment_counts.reindex(DAY_ORDER, fill_value=0)

                    # Daily Sentiment Analysis
                    st.write("### Daily Sentiment Analysis")
                    fig, ax = plt.subplots(figsize=(10, 5))
                    bar_colors = [SENTIMENT_COLORS.get(label, 'gray') for label in sentiment_counts.columns]
                    sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=bar_colors)
                    ax.set_xlabel("Day of the Week")
                    ax.set_ylabel("Count")
                    ax.set_title("Sentiment Distribution per Day of the Week")
                    ax.legend(title="Sentiment")
                    _show_figure(fig)

                    # Count messages per user per sentiment (only for Overall view)
                    if selected_user == 'Overall':
                        sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                        # Share of each sentiment's messages contributed by the user.
                        total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()
                        sentiment_counts['Percentage'] = (
                            sentiment_counts['Count']
                            / sentiment_counts['sentiment'].map(total_per_sentiment)
                            * 100
                        )

                        # Sentiment Contribution Analysis
                        st.write("### Sentiment Contribution by User")

                        # Three side-by-side tables with the ten biggest contributors each.
                        for column, label in zip(st.columns(3), ['positive', 'neutral', 'negative']):
                            top_users = (
                                sentiment_counts[sentiment_counts['sentiment'] == label]
                                .sort_values(by='Count', ascending=False)
                                .head(10)
                            )
                            with column:
                                st.subheader(f"Top {label.capitalize()} Contributors")
                                if not top_users.empty:
                                    st.dataframe(top_users[['user', 'Count', 'Percentage']])
                                else:
                                    st.warning(f"No {label} sentiment data")

                # Topic Analysis Section
                # NOTE: no topic modelling happens here; the sample-message
                # part below only runs when a 'topic' column is already present.
                st.title("🔍 Area of Focus: Topic Analysis")

                # Plot Topic Distribution
                st.header("Topic Distribution")
                try:
                    fig = helper.plot_topic_distribution(df_filtered)
                    _show_figure(fig)
                except Exception as e:
                    st.warning(f"Could not display topic distribution: {str(e)}")

                # Display Sample Messages for Each Topic
                st.header("Sample Messages for Each Topic")
                if 'topic' in df_filtered.columns:
                    for topic_id in sorted(df_filtered['topic'].unique()):
                        st.subheader(f"Topic {topic_id}")

                        # Up to five random messages of the current topic.
                        filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']
                        sample_size = min(5, len(filtered_messages))

                        if sample_size > 0:
                            sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                            for msg in sample_messages:
                                st.write(f"- {msg}")
                        else:
                            st.write("No messages available for this topic.")
                else:
                    st.warning("Topic information not available")

                # Topic Distribution Over Time
                st.header("📅 Topic Trends Over Time")

                # Add time frequency selector
                time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

                # Plot topic trends
                try:
                    freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                    topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                    # Choose between static and interactive plot
                    use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')

                    if use_plotly:
                        fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                        st.plotly_chart(fig, use_container_width=True)
                    else:
                        fig = helper.plot_topic_distribution_over_time(topic_distribution)
                        _show_figure(fig)
                except Exception as e:
                    st.warning(f"Could not display topic trends: {str(e)}")

                # Clustering Analysis Section
                st.title("🧩 Conversation Clusters")

                # Number of clusters input
                n_clusters = st.slider("Select number of clusters",
                                       min_value=2,
                                       max_value=10,
                                       value=5,
                                       key='n_clusters')

                # Perform clustering
                with st.spinner("Analyzing conversation clusters..."):
                    try:
                        df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                        # Plot clusters
                        st.header("Cluster Visualization")
                        fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                        _show_figure(fig)

                        # Cluster Insights
                        st.header("📌 Cluster Insights")

                        # 1. Dominant Conversation Themes
                        st.subheader("1. Dominant Themes")
                        cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                        for cluster_id, label in cluster_labels.items():
                            st.write(f"**Cluster {cluster_id}**: {label}")

                        # 2. Temporal Patterns
                        st.subheader("2. Temporal Patterns")
                        temporal_trends = helper.get_temporal_trends(df_clustered)
                        for cluster_id, trend in temporal_trends.items():
                            st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                        # 3. User Contributions
                        if selected_user == 'Overall':
                            st.subheader("3. Top Contributors")
                            user_contributions = helper.get_user_contributions(df_clustered)
                            for cluster_id, users in user_contributions.items():
                                st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                        # 4. Sentiment by Cluster (needs the sentiment column)
                        st.subheader("4. Sentiment Analysis")
                        if 'sentiment' in df_clustered.columns:
                            sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                            for cluster_id, sentiment in sentiment_by_cluster.items():
                                st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")
                        else:
                            st.warning("Sentiment data is unavailable for the clusters.")

                        # Sample messages from each cluster
                        st.subheader("Sample Messages")
                        for cluster_id in sorted(df_clustered['cluster'].unique()):
                            with st.expander(f"Cluster {cluster_id} Messages"):
                                cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                                sample_size = min(3, len(cluster_msgs))
                                if sample_size > 0:
                                    for msg in cluster_msgs.sample(sample_size, replace=False):
                                        st.write(f"- {msg}")
                                else:
                                    st.write("No messages available")

                    except Exception as e:
                        st.error(f"Clustering failed: {str(e)}")