Files changed (5)
  1. app.py +438 -0
  2. helper.py +323 -0
  3. preprocessor.py +199 -0
  4. requirements.txt +23 -0
  5. sentiment.py +98 -0
app.py CHANGED
@@ -0,0 +1,438 @@
1
+ import streamlit as st
2
+ st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")
3
+
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import preprocessor, helper
8
+ from sentiment import predict_sentiment_batch
9
+ import os
10
+ os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
11
+
12
+ # Theme customization
13
+ st.markdown(
14
+ """
15
+ <style>
16
+ .main {background-color: #f0f2f6;}
17
+ </style>
18
+ """,
19
+ unsafe_allow_html=True
20
+ )
21
+
22
+ # Set seaborn style
23
+ sns.set_theme(style="whitegrid")
24
+
25
+ st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
26
+ st.subheader('Instructions')
27
+ st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
28
+ st.markdown("2. Wait for the initial processing (minimal delay).")
29
+ st.markdown("3. Customize the analysis by selecting users or filters.")
30
+ st.markdown("4. Click 'Show Analysis' for detailed results.")
31
+
32
+ st.sidebar.title("Whatsapp Chat Analyzer")
33
+ uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")
34
+
35
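+ # st.cache_data hashes the raw file text, so re-running the app on the same
+ # upload reuses the parsed dataframe instead of repeating preprocessing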
+ @st.cache_data
36
+ def load_and_preprocess(file_content):
37
+ return preprocessor.preprocess(file_content)
38
+
39
+ if uploaded_file is not None:
40
+ raw_data = uploaded_file.read().decode("utf-8")
41
+ with st.spinner("Loading chat data..."):
42
+ df, _ = load_and_preprocess(raw_data)
43
+ st.session_state.df = df
44
+
45
+ st.sidebar.header("🔍 Filters")
46
+ user_list = ["Overall"] + sorted(df["user"].unique().tolist())
47
+ selected_user = st.sidebar.selectbox("Select User", user_list)
48
+
49
+ # Work on a copy so adding the 'sentiment' column later doesn't modify a view of df
+ df_filtered = (df if selected_user == "Overall" else df[df["user"] == selected_user]).copy()
50
+
51
+ if st.sidebar.button("Show Analysis"):
52
+ if df_filtered.empty:
53
+ st.warning(f"No data found for user: {selected_user}")
54
+ else:
55
+ with st.spinner("Analyzing..."):
56
+ if 'sentiment' not in df_filtered.columns:
57
+ try:
58
+ print("Starting sentiment analysis...")
59
+ # Get messages as clean strings, one entry per row so the predictions
+ # line up with df_filtered when assigned back
+ message_list = df_filtered["message"].astype(str).tolist()
+
+ print(f"Processing {len(message_list)} messages")
+ print(f"Sample messages: {message_list[:5]}")
+
+ # predict_sentiment_batch returns one label per input message
+ df_filtered['sentiment'] = predict_sentiment_batch(message_list)
68
+ print("Sentiment analysis completed successfully")
69
+
70
+ except Exception as e:
71
+ st.error(f"Sentiment analysis failed: {str(e)}")
72
+ print(f"Full error: {str(e)}")
73
+
74
+ st.session_state.df_filtered = df_filtered
75
+ else:
76
+ st.session_state.df_filtered = df_filtered
77
+
78
+ # Display statistics and visualizations
79
+ num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
80
+ st.title("Top Statistics")
81
+ col1, col2, col3, col4 = st.columns(4)
82
+ with col1:
83
+ st.header("Total Messages")
84
+ st.title(num_messages)
85
+ with col2:
86
+ st.header("Total Words")
87
+ st.title(words)
88
+ with col3:
89
+ st.header("Media Shared")
90
+ st.title(num_media)
91
+ with col4:
92
+ st.header("Links Shared")
93
+ st.title(num_links)
94
+
95
+ st.title("Monthly Timeline")
96
+ timeline = helper.monthly_timeline(selected_user, df_filtered)  # full data so the counts aren't undersampled
97
+ if not timeline.empty:
98
+ plt.figure(figsize=(10, 5))
99
+ sns.lineplot(data=timeline, x='time', y='message', color='green')
100
+ plt.title("Monthly Timeline")
101
+ plt.xlabel("Date")
102
+ plt.ylabel("Messages")
103
+ st.pyplot(plt)
104
+ plt.clf()
105
+
106
+ st.title("Daily Timeline")
107
+ daily_timeline = helper.daily_timeline(selected_user, df_filtered)
108
+ if not daily_timeline.empty:
109
+ plt.figure(figsize=(10, 5))
110
+ sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
111
+ plt.title("Daily Timeline")
112
+ plt.xlabel("Date")
113
+ plt.ylabel("Messages")
114
+ st.pyplot(plt)
115
+ plt.clf()
116
+
117
+ st.title("Activity Map")
118
+ col1, col2 = st.columns(2)
119
+ with col1:
120
+ st.header("Most Busy Day")
121
+ busy_day = helper.week_activity_map(selected_user, df_filtered)
122
+ if not busy_day.empty:
123
+ plt.figure(figsize=(10, 5))
124
+ sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
125
+ plt.title("Most Busy Day")
126
+ plt.xlabel("Day of Week")
127
+ plt.ylabel("Message Count")
128
+ st.pyplot(plt)
129
+ plt.clf()
130
+ with col2:
131
+ st.header("Most Busy Month")
132
+ busy_month = helper.month_activity_map(selected_user, df_filtered)
133
+ if not busy_month.empty:
134
+ plt.figure(figsize=(10, 5))
135
+ sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
136
+ plt.title("Most Busy Month")
137
+ plt.xlabel("Month")
138
+ plt.ylabel("Message Count")
139
+ st.pyplot(plt)
140
+ plt.clf()
141
+
142
+ if selected_user == 'Overall':
143
+ st.title("Most Busy Users")
144
+ x, new_df = helper.most_busy_users(df_filtered)
145
+ if not x.empty:
146
+ plt.figure(figsize=(10, 5))
147
+ sns.barplot(x=x.index, y=x.values, palette="Reds_r")
148
+ plt.title("Most Busy Users")
149
+ plt.xlabel("User")
150
+ plt.ylabel("Message Count")
151
+ plt.xticks(rotation=45)
152
+ st.pyplot(plt)
153
+ st.title("Word Count by User")
154
+ plt.clf()
155
+ st.dataframe(new_df)
156
+
157
+ # Most common words analysis
158
+ st.title("Most Common Words")
159
+ most_common_df = helper.most_common_words(selected_user, df_filtered)
160
+ if not most_common_df.empty:
161
+ fig, ax = plt.subplots(figsize=(10, 6))
162
+ sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
163
+ ax.set_title("Top 20 Most Common Words")
164
+ ax.set_xlabel("Frequency")
165
+ ax.set_ylabel("Words")
166
+ plt.xticks(rotation='vertical')
167
+ st.pyplot(fig)
168
+ plt.clf()
169
+ else:
170
+ st.warning("No data available for most common words.")
171
+
172
+ # Emoji analysis
173
+ st.title("Emoji Analysis")
174
+ emoji_df = helper.emoji_helper(selected_user, df_filtered)
175
+ if not emoji_df.empty:
176
+ col1, col2 = st.columns(2)
177
+
178
+ with col1:
179
+ st.subheader("Top Emojis Used")
180
+ st.dataframe(emoji_df)
181
+
182
+ with col2:
183
+ fig, ax = plt.subplots(figsize=(8, 8))
184
+ ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
185
+ autopct="%0.2f%%", startangle=90,
186
+ colors=sns.color_palette("pastel"))
187
+ ax.set_title("Top Emoji Distribution")
188
+ st.pyplot(fig)
189
+ plt.clf()
190
+ else:
191
+ st.warning("No data available for emoji analysis.")
192
+
193
+ # Sentiment Analysis Visualizations
194
+ st.title("📈 Sentiment Analysis")
195
+
196
+ # Convert month names to abbreviated format
197
+ month_map = {
198
+ 'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
199
+ 'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
200
+ 'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
201
+ }
202
+ df_filtered['month'] = df_filtered['month'].map(month_map)
203
+
204
+ # Group by month and sentiment, then reorder the rows so the charts run Jan–Dec
+ # instead of alphabetically
+ monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
+ monthly_sentiment = monthly_sentiment.reindex([m for m in month_map.values() if m in monthly_sentiment.index])
206
+
207
+ # Plotting: Histogram (Bar Chart) for each sentiment
208
+ st.write("### Sentiment Count by Month (Histogram)")
209
+
210
+ # Create a figure with subplots for each sentiment
211
+ fig, axes = plt.subplots(1, 3, figsize=(18, 5))
212
+
213
+ # Plot Positive Sentiment
214
+ if 'positive' in monthly_sentiment:
215
+ axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
216
+ axes[0].set_title('Positive Sentiment')
217
+ axes[0].set_xlabel('Month')
218
+ axes[0].set_ylabel('Count')
219
+
220
+ # Plot Neutral Sentiment
221
+ if 'neutral' in monthly_sentiment:
222
+ axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
223
+ axes[1].set_title('Neutral Sentiment')
224
+ axes[1].set_xlabel('Month')
225
+ axes[1].set_ylabel('Count')
226
+
227
+ # Plot Negative Sentiment
228
+ if 'negative' in monthly_sentiment:
229
+ axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
230
+ axes[2].set_title('Negative Sentiment')
231
+ axes[2].set_xlabel('Month')
232
+ axes[2].set_ylabel('Count')
233
+
234
+ # Display the plots in Streamlit
235
+ st.pyplot(fig)
236
+ plt.clf()
237
+
238
+ # Count sentiments per day of the week
239
+ sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
240
+
241
+ # Sort days correctly
242
+ day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
243
+ sentiment_counts = sentiment_counts.reindex(day_order)
244
+
245
+ # Daily Sentiment Analysis
246
+ st.write("### Daily Sentiment Analysis")
247
+
248
+ # Create a Matplotlib figure
249
+ fig, ax = plt.subplots(figsize=(10, 5))
250
+ sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])
251
+
252
+ # Customize the plot
253
+ ax.set_xlabel("Day of the Week")
254
+ ax.set_ylabel("Count")
255
+ ax.set_title("Sentiment Distribution per Day of the Week")
256
+ ax.legend(title="Sentiment")
257
+
258
+ # Display the plot in Streamlit
259
+ st.pyplot(fig)
260
+ plt.clf()
261
+
262
+ # Count messages per user per sentiment (only for Overall view)
263
+ if selected_user == 'Overall':
264
+ sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')
265
+
266
+ # Calculate total messages per sentiment
267
+ total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()
268
+
269
+ # Each user's share (%) of all messages carrying that sentiment
270
+ sentiment_counts['Percentage'] = sentiment_counts.apply(
271
+ lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
272
+ )
273
+
274
+ # Separate tables for each sentiment
275
+ positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
276
+ neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
277
+ negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)
278
+
279
+ # Sentiment Contribution Analysis
280
+ st.write("### Sentiment Contribution by User")
281
+
282
+ # Create three columns for side-by-side display
283
+ col1, col2, col3 = st.columns(3)
284
+
285
+ # Display Positive Table
286
+ with col1:
287
+ st.subheader("Top Positive Contributors")
288
+ if not positive_df.empty:
289
+ st.dataframe(positive_df[['user', 'Count', 'Percentage']])
290
+ else:
291
+ st.warning("No positive sentiment data")
292
+
293
+ # Display Neutral Table
294
+ with col2:
295
+ st.subheader("Top Neutral Contributors")
296
+ if not neutral_df.empty:
297
+ st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
298
+ else:
299
+ st.warning("No neutral sentiment data")
300
+
301
+ # Display Negative Table
302
+ with col3:
303
+ st.subheader("Top Negative Contributors")
304
+ if not negative_df.empty:
305
+ st.dataframe(negative_df[['user', 'Count', 'Percentage']])
306
+ else:
307
+ st.warning("No negative sentiment data")
308
+
309
+ # Topic Analysis Section
310
+ st.title("🔍 Area of Focus: Topic Analysis")
311
+
312
+ # Check if topic column exists, otherwise perform topic modeling
313
+ # if 'topic' not in df_filtered.columns:
314
+ # with st.spinner("Performing topic modeling..."):
315
+ # try:
316
+ # # Add topic modeling here or ensure your helper functions handle it
317
+ # df_filtered = helper.perform_topic_modeling(df_filtered)
318
+ # except Exception as e:
319
+ # st.error(f"Topic modeling failed: {str(e)}")
320
+ # st.stop()
321
+
322
+ # Plot Topic Distribution
323
+ st.header("Topic Distribution")
324
+ try:
325
+ fig = helper.plot_topic_distribution(df_filtered)
326
+ st.pyplot(fig)
327
+ plt.clf()
328
+ except Exception as e:
329
+ st.warning(f"Could not display topic distribution: {str(e)}")
330
+
331
+ # Display Sample Messages for Each Topic
332
+ st.header("Sample Messages for Each Topic")
333
+ if 'topic' in df_filtered.columns:
334
+ for topic_id in sorted(df_filtered['topic'].unique()):
335
+ st.subheader(f"Topic {topic_id}")
336
+
337
+ # Get messages for the current topic
338
+ filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']
339
+
340
+ # Determine sample size
341
+ sample_size = min(5, len(filtered_messages))
342
+
343
+ if sample_size > 0:
344
+ sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
345
+ for msg in sample_messages:
346
+ st.write(f"- {msg}")
347
+ else:
348
+ st.write("No messages available for this topic.")
349
+ else:
350
+ st.warning("Topic information not available")
351
+
352
+ # Topic Distribution Over Time
353
+ st.header("📅 Topic Trends Over Time")
354
+
355
+ # Add time frequency selector
356
+ time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')
357
+
358
+ # Plot topic trends
359
+ try:
360
+ freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
361
+ topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])
362
+
363
+ # Choose between static and interactive plot
364
+ use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')
365
+
366
+ if use_plotly:
367
+ fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
368
+ st.plotly_chart(fig, use_container_width=True)
369
+ else:
370
+ fig = helper.plot_topic_distribution_over_time(topic_distribution)
371
+ st.pyplot(fig)
372
+ plt.clf()
373
+ except Exception as e:
374
+ st.warning(f"Could not display topic trends: {str(e)}")
375
+
376
+ # Clustering Analysis Section
377
+ st.title("🧩 Conversation Clusters")
378
+
379
+ # Number of clusters input
380
+ n_clusters = st.slider("Select number of clusters",
381
+ min_value=2,
382
+ max_value=10,
383
+ value=5,
384
+ key='n_clusters')
385
+
386
+ # Perform clustering
387
+ with st.spinner("Analyzing conversation clusters..."):
388
+ try:
389
+ df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)
390
+
391
+ # Plot clusters
392
+ st.header("Cluster Visualization")
393
+ fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
394
+ st.pyplot(fig)
395
+ plt.clf()
396
+
397
+ # Cluster Insights
398
+ st.header("📌 Cluster Insights")
399
+
400
+ # 1. Dominant Conversation Themes
401
+ st.subheader("1. Dominant Themes")
402
+ cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
403
+ for cluster_id, label in cluster_labels.items():
404
+ st.write(f"**Cluster {cluster_id}**: {label}")
405
+
406
+ # 2. Temporal Patterns
407
+ st.subheader("2. Temporal Patterns")
408
+ temporal_trends = helper.get_temporal_trends(df_clustered)
409
+ for cluster_id, trend in temporal_trends.items():
410
+ st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")
411
+
412
+ # 3. User Contributions
413
+ if selected_user == 'Overall':
414
+ st.subheader("3. Top Contributors")
415
+ user_contributions = helper.get_user_contributions(df_clustered)
416
+ for cluster_id, users in user_contributions.items():
417
+ st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")
418
+
419
+ # 4. Sentiment by Cluster
420
+ st.subheader("4. Sentiment Analysis")
421
+ sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
422
+ for cluster_id, sentiment in sentiment_by_cluster.items():
423
+ st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")
424
+
425
+ # Sample messages from each cluster
426
+ st.subheader("Sample Messages")
427
+ for cluster_id in sorted(df_clustered['cluster'].unique()):
428
+ with st.expander(f"Cluster {cluster_id} Messages"):
429
+ cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
430
+ sample_size = min(3, len(cluster_msgs))
431
+ if sample_size > 0:
432
+ for msg in cluster_msgs.sample(sample_size, replace=False):
433
+ st.write(f"- {msg}")
434
+ else:
435
+ st.write("No messages available")
436
+
437
+ except Exception as e:
438
+ st.error(f"Clustering failed: {str(e)}")
helper.py ADDED
@@ -0,0 +1,323 @@
1
+ from urlextract import URLExtract
2
+ from wordcloud import WordCloud
3
+ import pandas as pd
4
+ from collections import Counter
5
+ import emoji
6
+ import plotly.express as px
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+
10
+ extract = URLExtract()
11
+
12
+ def fetch_stats(selected_user, df):
13
+ if selected_user != 'Overall':
14
+ df = df[df['user'] == selected_user]
15
+ num_messages = df.shape[0]
16
+ words = sum(len(msg.split()) for msg in df['message'])
17
+ num_media_messages = df[df['unfiltered_messages'].str.lower().str.contains('<media omitted>')].shape[0]
18
+ links = sum(len(extract.find_urls(msg)) for msg in df['unfiltered_messages'])
19
+ return num_messages, words, num_media_messages, links
20
+
21
+ def most_busy_users(df):
+ x = df['user'].value_counts().head()
+ # Percentage of total messages per user, with explicit column names so the
+ # table is labelled correctly regardless of pandas version
+ new_df = round((df['user'].value_counts() / df.shape[0]) * 100, 2).rename_axis('Name').reset_index(name='percentage')
+ return x, new_df
26
+
27
+ def create_wordcloud(selected_user, df):
28
+ if selected_user != 'Overall':
29
+ df = df[df['user'] == selected_user]
30
+ temp = df[df['user'] != 'group_notification']
31
+ temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
32
+ wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
33
+ df_wc = wc.generate(temp['message'].str.cat(sep=" "))
34
+ return df_wc
35
+
36
+ def most_common_words(selected_user, df):
37
+ if selected_user != 'Overall':
38
+ df = df[df['user'] == selected_user]
39
+ temp = df[df['user'] != 'group_notification']
40
+ temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
41
+ words = [word for msg in temp['message'] for word in msg.lower().split()]
42
+ return pd.DataFrame(Counter(words).most_common(20))
43
+
44
+ def emoji_helper(selected_user, df):
45
+ if selected_user != 'Overall':
46
+ df = df[df['user'] == selected_user]
47
+ emojis = [c for msg in df['unfiltered_messages'] for c in msg if c in emoji.EMOJI_DATA]
48
+ return pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))
49
+
50
+ def monthly_timeline(selected_user, df):
51
+ if selected_user != 'Overall':
52
+ df = df[df['user'] == selected_user]
53
+ timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
54
+ timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
55
+ return timeline
56
+
57
+ def daily_timeline(selected_user, df):
58
+ if selected_user != 'Overall':
59
+ df = df[df['user'] == selected_user]
60
+ return df.groupby('date').count()['message'].reset_index()
61
+
62
+ def week_activity_map(selected_user, df):
63
+ if selected_user != 'Overall':
64
+ df = df[df['user'] == selected_user]
65
+ return df['day_of_week'].value_counts()
66
+
67
+ def month_activity_map(selected_user, df):
68
+ if selected_user != 'Overall':
69
+ df = df[df['user'] == selected_user]
70
+ return df['month'].value_counts()
71
+
72
+ def plot_topic_distribution(df):
73
+ topic_counts = df['topic'].value_counts().sort_index()
74
+ fig = px.bar(x=topic_counts.index, y=topic_counts.values, title="Topic Distribution", color_discrete_sequence=px.colors.sequential.Viridis)
75
+ return fig
76
+
77
+ def topic_distribution_over_time(df, time_freq='M'):
78
+ df['time_period'] = df['date'].dt.to_period(time_freq)
79
+ return df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
80
+
81
+ def plot_topic_distribution_over_time_plotly(topic_distribution):
82
+ topic_distribution = topic_distribution.reset_index()
83
+ topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
84
+ topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
85
+ fig = px.line(topic_distribution, x='time_period', y='count', color='topic', title="Topic Distribution Over Time")
86
+ fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
87
+ return fig
88
+
89
+ def plot_clusters(reduced_features, clusters):
90
+ fig = px.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1], color=clusters, title="Message Clusters (t-SNE)")
91
+ return fig
92
+ def most_common_words(selected_user, df):
+ # Stop words from stop_hinglish.txt; fall back to an empty set if the file is missing
+ try:
+ with open('stop_hinglish.txt', 'r') as f:
+ stop_words = set(f.read().split())
+ except FileNotFoundError:
+ stop_words = set()
95
+
96
+ if selected_user != 'Overall':
97
+ df = df[df['user'] == selected_user]
98
+
99
+ temp = df[df['user'] != 'group_notification']
100
+ temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
101
+
102
+ words = []
103
+
104
+ for message in temp['message']:
105
+ for word in message.lower().split():
106
+ if word not in stop_words:
107
+ words.append(word)
108
+
109
+ most_common_df = pd.DataFrame(Counter(words).most_common(20))
110
+ return most_common_df
111
+
112
+ def emoji_helper(selected_user, df):
113
+ if selected_user != 'Overall':
114
+ df = df[df['user'] == selected_user]
115
+
116
+ emojis = []
117
+ for message in df['unfiltered_messages']:
118
+ emojis.extend([c for c in message if c in emoji.EMOJI_DATA])
119
+
120
+ emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))
121
+
122
+ return emoji_df
123
+ def plot_topic_distribution(df):
124
+ """
125
+ Plots the distribution of topics in the chat data.
126
+ """
127
+ topic_counts = df['topic'].value_counts().sort_index()
128
+ fig, ax = plt.subplots()
129
+ sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
130
+ ax.set_title("Topic Distribution")
131
+ ax.set_xlabel("Topic")
132
+ ax.set_ylabel("Number of Messages")
133
+ return fig
134
+
135
+ def most_frequent_keywords(messages, top_n=10):
136
+ """
137
+ Extracts the most frequent keywords from a list of messages.
138
+ """
139
+ words = [word for msg in messages for word in msg.split()]
140
+ word_freq = Counter(words)
141
+ return word_freq.most_common(top_n)
142
+ def plot_topic_distribution_over_time(topic_distribution):
143
+ """
144
+ Plots the distribution of topics over time using a line chart.
145
+ """
146
+ fig, ax = plt.subplots(figsize=(12, 6))
147
+
148
+ # Plot each topic as a separate line
149
+ for topic in topic_distribution.columns:
150
+ ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")
151
+
152
+ ax.set_title("Topic Distribution Over Time")
153
+ ax.set_xlabel("Time Period")
154
+ ax.set_ylabel("Number of Messages")
155
+ ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
156
+ plt.xticks(rotation=45)
157
+ plt.tight_layout()
158
+ return fig
159
+
160
+ def plot_most_frequent_keywords(keywords):
161
+ """
162
+ Plots the most frequent keywords.
163
+ """
164
+ words, counts = zip(*keywords)
165
+ fig, ax = plt.subplots()
166
+ sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
167
+ ax.set_title("Most Frequent Keywords")
168
+ ax.set_xlabel("Frequency")
169
+ ax.set_ylabel("Keyword")
170
+ return fig
171
+ def topic_distribution_over_time(df, time_freq='M'):
172
+ """
173
+ Analyzes the distribution of topics over time.
174
+ """
175
+ # Group by time interval and topic
176
+ df['time_period'] = df['date'].dt.to_period(time_freq)
177
+ topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
178
+ return topic_distribution
179
+
180
+ def plot_topic_distribution_over_time(topic_distribution):
181
+ """
182
+ Plots the distribution of topics over time using a line chart.
183
+ """
184
+ fig, ax = plt.subplots(figsize=(12, 6))
185
+
186
+ # Plot each topic as a separate line
187
+ for topic in topic_distribution.columns:
188
+ ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")
189
+
190
+ ax.set_title("Topic Distribution Over Time")
191
+ ax.set_xlabel("Time Period")
192
+ ax.set_ylabel("Number of Messages")
193
+ ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
194
+ plt.xticks(rotation=45)
195
+ plt.tight_layout()
196
+ return fig
197
+
198
+ def plot_topic_distribution_over_time_plotly(topic_distribution):
199
+ """
200
+ Plots the distribution of topics over time using Plotly.
201
+ """
202
+ topic_distribution = topic_distribution.reset_index()
203
+ topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
204
+ topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
205
+
206
+ fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
207
+ title="Topic Distribution Over Time", labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
208
+ fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
209
+ return fig
210
+ def plot_clusters(reduced_features, clusters):
211
+ """
212
+ Visualize clusters using t-SNE.
213
+ Args:
214
+ reduced_features (np.array): 2D array of reduced features.
215
+ clusters (np.array): Cluster labels.
216
+ Returns:
217
+ fig (plt.Figure): Matplotlib figure object.
218
+ """
219
+ plt.figure(figsize=(10, 8))
220
+ sns.scatterplot(
221
+ x=reduced_features[:, 0],
222
+ y=reduced_features[:, 1],
223
+ hue=clusters,
224
+ palette="viridis",
225
+ legend="full"
226
+ )
227
+ plt.title("Message Clusters (t-SNE Visualization)")
228
+ plt.xlabel("t-SNE Component 1")
229
+ plt.ylabel("t-SNE Component 2")
230
+ plt.tight_layout()
231
+ return plt.gcf()
232
+ def get_cluster_labels(df, n_clusters):
233
+ """
234
+ Generate descriptive labels for each cluster based on top keywords.
235
+ """
236
+ from sklearn.feature_extraction.text import TfidfVectorizer
237
+ import numpy as np
238
+
239
+ vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
240
+ tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
241
+
242
+ cluster_labels = {}
243
+ for cluster_id in range(n_clusters):
244
+ cluster_indices = df[df['cluster'] == cluster_id].index
245
+ if len(cluster_indices) > 0:
246
+ cluster_tfidf = tfidf_matrix[cluster_indices]
247
+ top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
248
+ cluster_labels[cluster_id] = ", ".join(vectorizer.get_feature_names_out()[top_keywords])
249
+ else:
250
+ cluster_labels[cluster_id] = "No dominant theme"
251
+ return cluster_labels
252
+
253
+ def get_temporal_trends(df):
254
+ """
255
+ Analyze temporal trends for each cluster (peak day and time).
256
+ """
257
+ temporal_trends = {}
258
+ for cluster_id in df['cluster'].unique():
259
+ cluster_data = df[df['cluster'] == cluster_id]
260
+ if not cluster_data.empty:
261
+ peak_day = cluster_data['day_of_week'].mode()[0]
262
+ peak_time = cluster_data['hour'].mode()[0]
263
+ temporal_trends[cluster_id] = {"peak_day": peak_day, "peak_time": f"{peak_time}:00"}
264
+ return temporal_trends
265
+
266
+ def get_user_contributions(df):
267
+ """
268
+ Identify top contributors for each cluster.
269
+ """
270
+ user_contributions = {}
271
+ for cluster_id in df['cluster'].unique():
272
+ cluster_data = df[df['cluster'] == cluster_id]
273
+ if not cluster_data.empty:
274
+ top_users = cluster_data['user'].value_counts().head(3).index.tolist()
275
+ user_contributions[cluster_id] = top_users
276
+ return user_contributions
277
+
278
+ def get_sentiment_by_cluster(df):
279
+ """
280
+ Analyze sentiment distribution for each cluster.
281
+ """
282
+ sentiment_by_cluster = {}
283
+ for cluster_id in df['cluster'].unique():
284
+ cluster_data = df[df['cluster'] == cluster_id]
285
+ if not cluster_data.empty:
286
+ sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
287
+ sentiment_by_cluster[cluster_id] = {
288
+ "positive": round(sentiment_counts.get('positive', 0)),
289
+ "neutral": round(sentiment_counts.get('neutral', 0)),
290
+ "negative": round(sentiment_counts.get('negative', 0))
291
+ }
292
+ return sentiment_by_cluster
293
+
294
+ def detect_anomalies(df):
295
+ """
296
+ Detect anomalies in each cluster (e.g., high link or media share).
297
+ """
298
+ anomalies = {}
299
+ for cluster_id in df['cluster'].unique():
300
+ cluster_data = df[df['cluster'] == cluster_id]
301
+ if not cluster_data.empty:
302
+ link_share = (cluster_data['message'].str.contains('http').mean()) * 100
303
+ media_share = (cluster_data['message'].str.contains('<media omitted>').mean()) * 100
304
+ if link_share > 50:
305
+ anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
306
+ elif media_share > 50:
307
+ anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
308
+ return anomalies
309
+
310
+ def generate_recommendations(df):
311
+ """
312
+ Generate actionable recommendations based on cluster insights.
313
+ """
314
+ recommendations = []
315
+ for cluster_id in df['cluster'].unique():
316
+ cluster_data = df[df['cluster'] == cluster_id]
317
+ if not cluster_data.empty:
318
+ sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
319
+ if sentiment_counts.get('negative', 0) > 50:
320
+ recommendations.append(f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
321
+ if cluster_data['message'].str.contains('http').mean() > 0.5:
322
+ recommendations.append(f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
323
+ return recommendations
preprocessor.py ADDED
@@ -0,0 +1,199 @@
1
+ import re
2
+ import pandas as pd
3
+ import spacy
4
+ from langdetect import detect_langs
5
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
+ from sklearn.decomposition import LatentDirichletAllocation
7
+ from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
8
+ from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
9
+ from sklearn.cluster import KMeans
10
+ from sklearn.manifold import TSNE
11
+ import numpy as np
12
+ import torch
13
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
14
+ import streamlit as st
15
+
16
+ # Lighter model
17
+ MODEL ="cardiffnlp/twitter-xlm-roberta-base-sentiment"
18
+
19
+ # Cache model loading with fallback for quantization
20
+ @st.cache_resource
21
+ def load_model():
22
+ device = "cuda" if torch.cuda.is_available() else "cpu"
23
+ print(f"Using device: {device}")
24
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
25
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
26
+
27
+ # Attempt quantization with fallback
28
+ try:
29
+ # Set quantization engine explicitly: fbgemm on x86, qnnpack on ARM
+ torch.backends.quantized.engine = 'fbgemm' if 'fbgemm' in torch.backends.quantized.supported_engines else 'qnnpack'
31
+ model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
32
+ print("Model quantized successfully.")
33
+ except RuntimeError as e:
34
+ print(f"Quantization failed: {e}. Using non-quantized model.")
35
+
36
+ config = AutoConfig.from_pretrained(MODEL)
37
+ return tokenizer, model, config, device
38
+
39
+ tokenizer, model, config, device = load_model()
40
+
41
+ nlp_fr = spacy.load("fr_core_news_sm")
42
+ nlp_en = spacy.load("en_core_web_sm")
43
+ custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))
44
+
45
+ def preprocess_text(text):  # distinct name so it isn't shadowed by preprocess(data) defined below
46
+ if text is None:
47
+ return ""
48
+ if not isinstance(text, str):
49
+ try:
50
+ text = str(text)
51
+ except:
52
+ return ""
53
+ new_text = []
54
+ for t in text.split(" "):
55
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
56
+ t = 'http' if t.startswith('http') else t
57
+ new_text.append(t)
58
+ return " ".join(new_text)
59
+
60
+ def clean_message(text):
61
+ if not isinstance(text, str):
62
+ return ""
63
+ text = text.lower()
64
+ text = text.replace("<media omitted>", "").replace("this message was deleted", "").replace("null", "")
65
+ text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
66
+ text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)
67
+ return text.strip()
68
+
69
+ def lemmatize_text(text, lang):
70
+ if lang == 'fr':
71
+ doc = nlp_fr(text)
72
+ else:
73
+ doc = nlp_en(text)
74
+ return " ".join([token.lemma_ for token in doc if not token.is_punct])
75
+
76
+ def preprocess(data):
77
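+ # Assumed WhatsApp export line format (inferred from the regex below):
+ # "M/D/YY, H:MM AM/PM - Sender: message"; lines with no "Sender:" part are system notices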
+ pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
78
+ filtered_messages, valid_dates = [], []
79
+
80
+ for line in data.strip().split("\n"):
81
+ match = re.match(pattern, line)
82
+ if match:
83
+ entry = match.groupdict()
84
+ sender = entry.get("Sender")
85
+ if sender and sender.strip().lower() != "system":
86
+ filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
87
+ valid_dates.append(f"{entry['Date']}, {entry['Time'].replace(' ', ' ')}")
88
+
89
+ df = pd.DataFrame({'user_message': filtered_messages, 'message_date': valid_dates})
90
+ df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
91
+ df.rename(columns={'message_date': 'date'}, inplace=True)
92
+
93
+ users, messages = [], []
94
+ msg_pattern = r"^(.*?):\s(.*)$"
95
+ for message in df["user_message"]:
96
+ match = re.match(msg_pattern, message)
97
+ if match:
98
+ users.append(match.group(1))
99
+ messages.append(match.group(2))
100
+ else:
101
+ users.append("group_notification")
102
+ messages.append(message)
103
+
104
+ df["user"] = users
105
+ df["message"] = messages
106
+ df = df[df["user"] != "group_notification"].reset_index(drop=True)
107
+ df["unfiltered_messages"] = df["message"]
108
+ df["message"] = df["message"].apply(clean_message)
109
+
110
+ # Extract time-based features
111
+ df['year'] = pd.to_numeric(df['date'].dt.year, downcast='integer')
112
+ df['month'] = df['date'].dt.month_name()
113
+ df['day'] = pd.to_numeric(df['date'].dt.day, downcast='integer')
114
+ df['hour'] = pd.to_numeric(df['date'].dt.hour, downcast='integer')
115
+ df['day_of_week'] = df['date'].dt.day_name()
116
+
117
+ # Lemmatize messages for topic modeling
118
+ lemmatized_messages = []
119
+ for message in df["message"]:
120
+ try:
121
+ lang = detect_langs(message)[0].lang  # most probable language code, e.g. 'en' or 'fr'
+ lemmatized_messages.append(lemmatize_text(message, lang))
+ except Exception:
124
+ lemmatized_messages.append("")
125
+ df["lemmatized_message"] = lemmatized_messages
126
+
127
+ df = df[df["message"].notnull() & (df["message"] != "")].copy()
128
+ df.drop(columns=["user_message"], inplace=True)
129
+
130
+ # Perform topic modeling
131
+ vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
132
+ dtm = vectorizer.fit_transform(df['lemmatized_message'])
133
+
134
+ # Apply LDA
135
+ lda = LatentDirichletAllocation(n_components=5, random_state=42)
136
+ lda.fit(dtm)
137
+
138
+ # Assign topics to messages
139
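+ # transform() returns a (n_messages, n_topics) probability matrix; the argmax
+ # below keeps only the single most likely topic per message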
+ topic_results = lda.transform(dtm)
140
+ df = df.iloc[:topic_results.shape[0]].copy()
141
+ df['topic'] = topic_results.argmax(axis=1)
142
+
143
+ # Store topics for visualization
144
+ topics = []
145
+ for topic in lda.components_:
146
+ topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
147
+ print("Top words for each topic-----------------------------------------------------:")
148
+ print(topics)
149
+
150
+ return df, topics
151
+
152
+ def preprocess_for_clustering(df, n_clusters=5):
153
+ df = df[df["lemmatized_message"].notnull() & (df["lemmatized_message"].str.strip() != "")]
154
+ df = df.reset_index(drop=True)
155
+
156
+ vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
157
+ tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
158
+
159
+ if tfidf_matrix.shape[0] < 2:
160
+ raise ValueError("Not enough messages for clustering.")
161
+
162
+ df = df.iloc[:tfidf_matrix.shape[0]].copy()
163
+
164
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
165
+ clusters = kmeans.fit_predict(tfidf_matrix)
166
+
167
+ df['cluster'] = clusters
168
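+ # t-SNE reduces the TF-IDF vectors to 2-D purely for the cluster scatter plot;
+ # .toarray() densifies the matrix, which is fine for chat-sized corpora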
+ tsne = TSNE(n_components=2, random_state=42)
169
+ reduced_features = tsne.fit_transform(tfidf_matrix.toarray())
170
+
171
+ return df, reduced_features, kmeans.cluster_centers_
172
+
173
+
174
+ def predict_sentiment_batch(texts: list, batch_size: int = 32) -> list:
175
+ """Predict sentiment for a batch of texts"""
176
+ if not isinstance(texts, list):
177
+ raise TypeError(f"Expected list of texts, got {type(texts)}")
178
+
179
+ processed_texts = [preprocess_text(text) for text in texts]
180
+
181
+ predictions = []
182
+ for i in range(0, len(processed_texts), batch_size):
183
+ batch = processed_texts[i:i+batch_size]
184
+
185
+ inputs = tokenizer(
186
+ batch,
187
+ padding=True,
188
+ truncation=True,
189
+ return_tensors="pt",
190
+ max_length=128
191
+ ).to(device)
192
+
193
+ with torch.no_grad():
194
+ outputs = model(**inputs)
195
+
196
+ batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
197
+ predictions.extend([config.id2label[p] for p in batch_preds])
198
+
199
+ return predictions
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ streamlit
2
+ # preprocessor.py is a local module in this repo; no PyPI package is needed for it
3
+ matplotlib
4
+ seaborn
5
+ urlextract
6
+ wordcloud
7
+ pandas
8
+ emoji
9
+ langdetect
10
+ tiktoken
11
+ googletrans
12
+ transformers==4.44.2
13
+ torch==2.4.0
14
+ sentencepiece==0.2.0
15
+ protobuf==5.28.0
16
+ scikit-learn
17
+ plotly
18
+ nltk
19
+ spacy==3.7.0
20
+ thinc>=8.1.8,<8.3.0
21
+ deep_translator
22
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
23
+ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl
sentiment.py ADDED
@@ -0,0 +1,98 @@
1
+ import pandas as pd
2
+ import time
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
5
+
6
+ # Use a sentiment-specific model (replace with TinyBERT if fine-tuned)
7
+ MODEL = "tabularisai/multilingual-sentiment-analysis" # Pre-trained for positive/negative sentiment
8
+
9
+ print("Loading model and tokenizer...")
10
+ start_load = time.time()
11
+
12
+ # Check for MPS (Metal) availability on M2 chip, fallback to CPU
13
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
14
+ print(f"Using device: {device}")
15
+
16
+ # Load with optimizations (only once, removing redundancy)
17
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
18
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
19
+ config = AutoConfig.from_pretrained(MODEL)
20
+
21
+ load_time = time.time() - start_load
22
+ print(f"Model and tokenizer loaded in {load_time:.2f} seconds\n")
23
+
24
+ # Optimized preprocessing (unchanged from your code)
25
+ def preprocess(text):
26
+ if not isinstance(text, str):
27
+ text = str(text) if not pd.isna(text) else ""
28
+
29
+ new_text = []
30
+ for t in text.split(" "):
31
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
32
+ t = 'http' if t.startswith('http') else t
33
+ new_text.append(t)
34
+ return " ".join(new_text)
35
+
36
+ # Batch prediction function (optimized for performance)
37
+ def predict_sentiment_batch(texts: list, batch_size: int = 16) -> list:
38
+ if not isinstance(texts, list):
39
+ raise TypeError(f"Expected list of texts, got {type(texts)}")
40
+
41
+ # Keep one entry per input (blank strings for non-text items) so the returned
+ # labels stay aligned with the caller's dataframe rows
+ valid_texts = [str(text) if isinstance(text, str) and text.strip() else "" for text in texts]
+ if not valid_texts:
+ return [] # Return empty list if nothing was passed in
+ start_time = time.time()
+
+ print(f"Processing {len(valid_texts)} samples...")
+ processed_texts = [preprocess(text) for text in valid_texts]
48
+
49
+ predictions = []
50
+ for i in range(0, len(processed_texts), batch_size):
51
+ batch = processed_texts[i:i + batch_size]
52
+ try:
53
+ inputs = tokenizer(
54
+ batch,
55
+ padding=True,
56
+ truncation=True,
57
+ return_tensors="pt",
58
+ max_length=64 # Reduced for speed on short texts like tweets
59
+ ).to(device)
60
+
61
+ with torch.no_grad():
62
+ outputs = model(**inputs)
63
+
64
+ batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
65
+ predictions.extend([config.id2label[p] for p in batch_preds])
66
+ except Exception as e:
67
+ print(f"Error processing batch {i // batch_size}: {str(e)}")
68
+ predictions.extend(["neutral"] * len(batch)) # Consider logging instead
69
+
70
+ print(f"Predictions for {len(valid_texts)} samples generated in {time.time() - start_load:.2f} seconds")
71
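+ # Normalize labels such as "Very Positive"/"Very Negative" down to the three
+ # classes (positive/neutral/negative) that the dashboard expects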
+ predictions = [prediction.lower().replace("very ", "") for prediction in predictions]
72
+
73
+ print(f"Sample predictions: {predictions[:5]}")
74
+
75
+ return predictions
76
+
77
+ # # Example usage with your dataset (uncomment and adjust paths)
78
+ # test_data = pd.read_csv("/Users/caasidev/development/AI/last try/Whatssap-project/srcs/tweets.csv")
79
+ # print(f"Processing {len(test_data)} samples...")
80
+ # start_prediction = time.time()
81
+
82
+ # text_samples = test_data['text'].tolist()
83
+ # test_data['predicted_sentiment'] = predict_sentiment_batch(text_samples)
84
+
85
+ # prediction_time = time.time() - start_prediction
86
+ # time_per_sample = prediction_time / len(test_data)
87
+
88
+ # # Print runtime statistics
89
+ # print("\nRuntime Statistics:")
90
+ # print(f"- Model loading time: {load_time:.2f} seconds")
91
+ # print(f"- Total prediction time for {len(test_data)} samples: {prediction_time:.2f} seconds")
92
+ # print(f"- Average time per sample: {time_per_sample:.4f} seconds")
93
+ # print(f"- Estimated time for 1000 samples: {(time_per_sample * 1000):.2f} seconds")
94
+ # print(f"- Estimated time for 20000 samples: {(time_per_sample * 20000 / 60):.2f} minutes")
95
+
96
+ # # Print a sample of predictions
97
+ # print("\nPredicted Sentiments (first 5 samples):")
98
+ # print(test_data[['text', 'predicted_sentiment']].head())