uploaded 5 files
#1 by afanyu237 - opened
- app.py +438 -0
- helper.py +323 -0
- preprocessor.py +199 -0
- requirements.txt +23 -0
- sentiment.py +98 -0
app.py
CHANGED
@@ -0,0 +1,438 @@
import streamlit as st
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor, helper
from sentiment import predict_sentiment_batch
import os
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"

# Theme customization
st.markdown(
    """
    <style>
    .main {background-color: #f0f2f6;}
    </style>
    """,
    unsafe_allow_html=True
)

# Set seaborn style
sns.set_theme(style="whitegrid")

st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

st.sidebar.title("Whatsapp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

@st.cache_data
def load_and_preprocess(file_content):
    return preprocessor.preprocess(file_content)

if uploaded_file is not None:
    raw_data = uploaded_file.read().decode("utf-8")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    df_filtered = df if selected_user == "Overall" else df[df["user"] == selected_user]

    if st.sidebar.button("Show Analysis"):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
                if 'sentiment' not in df_filtered.columns:
                    try:
                        print("Starting sentiment analysis...")
                        # Get messages as clean strings
                        message_list = df_filtered["message"].astype(str).tolist()
                        message_list = [msg for msg in message_list if msg.strip()]

                        print(f"Processing {len(message_list)} messages")
                        print(f"Sample messages: {message_list[:5]}")

                        # Directly call the sentiment analysis function
                        df_filtered['sentiment'] = predict_sentiment_batch(message_list)
                        print("Sentiment analysis completed successfully")

                    except Exception as e:
                        st.error(f"Sentiment analysis failed: {str(e)}")
                        print(f"Full error: {str(e)}")

                    st.session_state.df_filtered = df_filtered
                else:
                    st.session_state.df_filtered = df_filtered

            # Display statistics and visualizations
            num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
            st.title("Top Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.header("Total Messages")
                st.title(num_messages)
            with col2:
                st.header("Total Words")
                st.title(words)
            with col3:
                st.header("Media Shared")
                st.title(num_media)
            with col4:
                st.header("Links Shared")
                st.title(num_links)

            st.title("Monthly Timeline")
            timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=timeline, x='time', y='message', color='green')
                plt.title("Monthly Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Daily Timeline")
            daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not daily_timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
                plt.title("Daily Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Activity Map")
            col1, col2 = st.columns(2)
            with col1:
                st.header("Most Busy Day")
                busy_day = helper.week_activity_map(selected_user, df_filtered)
                if not busy_day.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
                    plt.title("Most Busy Day")
                    plt.xlabel("Day of Week")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()
            with col2:
                st.header("Most Busy Month")
                busy_month = helper.month_activity_map(selected_user, df_filtered)
                if not busy_month.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
                    plt.title("Most Busy Month")
                    plt.xlabel("Month")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()

            if selected_user == 'Overall':
                st.title("Most Busy Users")
                x, new_df = helper.most_busy_users(df_filtered)
                if not x.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=x.index, y=x.values, palette="Reds_r")
                    plt.title("Most Busy Users")
                    plt.xlabel("User")
                    plt.ylabel("Message Count")
                    plt.xticks(rotation=45)
                    st.pyplot(plt)
                    st.title("Word Count by User")
                    plt.clf()
                    st.dataframe(new_df)

            # Most common words analysis
            st.title("Most Common Words")
            most_common_df = helper.most_common_words(selected_user, df_filtered)
            if not most_common_df.empty:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                ax.set_title("Top 20 Most Common Words")
                ax.set_xlabel("Frequency")
                ax.set_ylabel("Words")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
                plt.clf()
            else:
                st.warning("No data available for most common words.")

            # Emoji analysis
            st.title("Emoji Analysis")
            emoji_df = helper.emoji_helper(selected_user, df_filtered)
            if not emoji_df.empty:
                col1, col2 = st.columns(2)

                with col1:
                    st.subheader("Top Emojis Used")
                    st.dataframe(emoji_df)

                with col2:
                    fig, ax = plt.subplots(figsize=(8, 8))
                    ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                           autopct="%0.2f%%", startangle=90,
                           colors=sns.color_palette("pastel"))
                    ax.set_title("Top Emoji Distribution")
                    st.pyplot(fig)
                    plt.clf()
            else:
                st.warning("No data available for emoji analysis.")

            # Sentiment Analysis Visualizations
            st.title("📈 Sentiment Analysis")

            # Convert month names to abbreviated format
            month_map = {
                'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
                'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
                'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
            }
            df_filtered['month'] = df_filtered['month'].map(month_map)

            # Group by month and sentiment
            monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)

            # Plotting: Histogram (Bar Chart) for each sentiment
            st.write("### Sentiment Count by Month (Histogram)")

            # Create a figure with subplots for each sentiment
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))

            # Plot Positive Sentiment
            if 'positive' in monthly_sentiment:
                axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
                axes[0].set_title('Positive Sentiment')
                axes[0].set_xlabel('Month')
                axes[0].set_ylabel('Count')

            # Plot Neutral Sentiment
            if 'neutral' in monthly_sentiment:
                axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
                axes[1].set_title('Neutral Sentiment')
                axes[1].set_xlabel('Month')
                axes[1].set_ylabel('Count')

            # Plot Negative Sentiment
            if 'negative' in monthly_sentiment:
                axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
                axes[2].set_title('Negative Sentiment')
                axes[2].set_xlabel('Month')
                axes[2].set_ylabel('Count')

            # Display the plots in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count sentiments per day of the week
            sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)

            # Sort days correctly
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sentiment_counts = sentiment_counts.reindex(day_order)

            # Daily Sentiment Analysis
            st.write("### Daily Sentiment Analysis")

            # Create a Matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])

            # Customize the plot
            ax.set_xlabel("Day of the Week")
            ax.set_ylabel("Count")
            ax.set_title("Sentiment Distribution per Day of the Week")
            ax.legend(title="Sentiment")

            # Display the plot in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count messages per user per sentiment (only for Overall view)
            if selected_user == 'Overall':
                sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                # Calculate total messages per sentiment
                total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()

                # Add percentage column
                sentiment_counts['Percentage'] = sentiment_counts.apply(
                    lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
                )

                # Separate tables for each sentiment
                positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
                neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
                negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)

                # Sentiment Contribution Analysis
                st.write("### Sentiment Contribution by User")

                # Create three columns for side-by-side display
                col1, col2, col3 = st.columns(3)

                # Display Positive Table
                with col1:
                    st.subheader("Top Positive Contributors")
                    if not positive_df.empty:
                        st.dataframe(positive_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No positive sentiment data")

                # Display Neutral Table
                with col2:
                    st.subheader("Top Neutral Contributors")
                    if not neutral_df.empty:
                        st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No neutral sentiment data")

                # Display Negative Table
                with col3:
                    st.subheader("Top Negative Contributors")
                    if not negative_df.empty:
                        st.dataframe(negative_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No negative sentiment data")

            # Topic Analysis Section
            st.title("🔍 Area of Focus: Topic Analysis")

            # Check if topic column exists, otherwise perform topic modeling
            # if 'topic' not in df_filtered.columns:
            #     with st.spinner("Performing topic modeling..."):
            #         try:
            #             # Add topic modeling here or ensure your helper functions handle it
            #             df_filtered = helper.perform_topic_modeling(df_filtered)
            #         except Exception as e:
            #             st.error(f"Topic modeling failed: {str(e)}")
            #             st.stop()

            # Plot Topic Distribution
            st.header("Topic Distribution")
            try:
                fig = helper.plot_topic_distribution(df_filtered)
                st.pyplot(fig)
                plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic distribution: {str(e)}")

            # Display Sample Messages for Each Topic
            st.header("Sample Messages for Each Topic")
            if 'topic' in df_filtered.columns:
                for topic_id in sorted(df_filtered['topic'].unique()):
                    st.subheader(f"Topic {topic_id}")

                    # Get messages for the current topic
                    filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']

                    # Determine sample size
                    sample_size = min(5, len(filtered_messages))

                    if sample_size > 0:
                        sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                        for msg in sample_messages:
                            st.write(f"- {msg}")
                    else:
                        st.write("No messages available for this topic.")
            else:
                st.warning("Topic information not available")

            # Topic Distribution Over Time
            st.header("📅 Topic Trends Over Time")

            # Add time frequency selector
            time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

            # Plot topic trends
            try:
                freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                # Choose between static and interactive plot
                use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')

                if use_plotly:
                    fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    fig = helper.plot_topic_distribution_over_time(topic_distribution)
                    st.pyplot(fig)
                    plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic trends: {str(e)}")

            # Clustering Analysis Section
            st.title("🧩 Conversation Clusters")

            # Number of clusters input
            n_clusters = st.slider("Select number of clusters",
                                   min_value=2,
                                   max_value=10,
                                   value=5,
                                   key='n_clusters')

            # Perform clustering
            with st.spinner("Analyzing conversation clusters..."):
                try:
                    df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                    # Plot clusters
                    st.header("Cluster Visualization")
                    fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                    st.pyplot(fig)
                    plt.clf()

                    # Cluster Insights
                    st.header("📌 Cluster Insights")

                    # 1. Dominant Conversation Themes
                    st.subheader("1. Dominant Themes")
                    cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                    for cluster_id, label in cluster_labels.items():
                        st.write(f"**Cluster {cluster_id}**: {label}")

                    # 2. Temporal Patterns
                    st.subheader("2. Temporal Patterns")
                    temporal_trends = helper.get_temporal_trends(df_clustered)
                    for cluster_id, trend in temporal_trends.items():
                        st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                    # 3. User Contributions
                    if selected_user == 'Overall':
                        st.subheader("3. Top Contributors")
                        user_contributions = helper.get_user_contributions(df_clustered)
                        for cluster_id, users in user_contributions.items():
                            st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                    # 4. Sentiment by Cluster
                    st.subheader("4. Sentiment Analysis")
                    sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                    for cluster_id, sentiment in sentiment_by_cluster.items():
                        st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")

                    # Sample messages from each cluster
                    st.subheader("Sample Messages")
                    for cluster_id in sorted(df_clustered['cluster'].unique()):
                        with st.expander(f"Cluster {cluster_id} Messages"):
                            cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                            sample_size = min(3, len(cluster_msgs))
                            if sample_size > 0:
                                for msg in cluster_msgs.sample(sample_size, replace=False):
                                    st.write(f"- {msg}")
                            else:
                                st.write("No messages available")

                except Exception as e:
                    st.error(f"Clustering failed: {str(e)}")
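For reference, a minimal sketch (not part of the uploaded files) of the kind of .txt export the uploader above expects. The names and messages are invented; the line format follows the US-style "m/d/yy, h:mm AM/PM - Sender: text" pattern that preprocessor.preprocess() parses below.

# Hypothetical miniature WhatsApp export; names and messages are invented.
sample_chat = (
    "1/5/24, 9:15 AM - Alice: Good morning everyone\n"
    "1/5/24, 9:17 AM - Bob: Morning! Meeting at 10?\n"
    "1/5/24, 9:20 AM - Alice: Yes, see you there\n"
)
with open("sample_chat.txt", "w", encoding="utf-8") as f:
    f.write(sample_chat)
# Uploading sample_chat.txt via the sidebar should yield user/message/date rows,
# though a file this small may be too short for the LDA topic-modeling step.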
helper.py
ADDED
@@ -0,0 +1,323 @@
from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
import emoji
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

extract = URLExtract()

def fetch_stats(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    num_messages = df.shape[0]
    words = sum(len(msg.split()) for msg in df['message'])
    num_media_messages = df[df['unfiltered_messages'] == '<media omitted>\n'].shape[0]
    links = sum(len(extract.find_urls(msg)) for msg in df['unfiltered_messages'])
    return num_messages, words, num_media_messages, links

def most_busy_users(df):
    x = df['user'].value_counts().head()
    df = round((df['user'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
        columns={'index': 'percentage', 'user': 'Name'})
    return x, df

def create_wordcloud(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc

def most_common_words(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
    words = [word for msg in temp['message'] for word in msg.lower().split()]
    return pd.DataFrame(Counter(words).most_common(20))

def emoji_helper(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    emojis = [c for msg in df['unfiltered_messages'] for c in msg if c in emoji.EMOJI_DATA]
    return pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))

def monthly_timeline(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline

def daily_timeline(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df.groupby('date').count()['message'].reset_index()

def week_activity_map(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['day_of_week'].value_counts()

def month_activity_map(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['month'].value_counts()

def plot_topic_distribution(df):
    topic_counts = df['topic'].value_counts().sort_index()
    fig = px.bar(x=topic_counts.index, y=topic_counts.values, title="Topic Distribution", color_discrete_sequence=['viridis'])
    return fig

def topic_distribution_over_time(df, time_freq='M'):
    df['time_period'] = df['date'].dt.to_period(time_freq)
    return df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)

def plot_topic_distribution_over_time_plotly(topic_distribution):
    topic_distribution = topic_distribution.reset_index()
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
    fig = px.line(topic_distribution, x='time_period', y='count', color='topic', title="Topic Distribution Over Time")
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig

def plot_clusters(reduced_features, clusters):
    fig = px.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1], color=clusters, title="Message Clusters (t-SNE)")
    return fig

def most_common_words(selected_user, df):
    # f = open('stop_hinglish.txt','r')
    stop_words = df

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = []

    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df

def emoji_helper(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['unfiltered_messages']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA])

    emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))

    return emoji_df

def plot_topic_distribution(df):
    """
    Plots the distribution of topics in the chat data.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
    ax.set_title("Topic Distribution")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Messages")
    return fig

def most_frequent_keywords(messages, top_n=10):
    """
    Extracts the most frequent keywords from a list of messages.
    """
    words = [word for msg in messages for word in msg.split()]
    word_freq = Counter(words)
    return word_freq.most_common(top_n)

def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot each topic as a separate line
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig

def plot_most_frequent_keywords(keywords):
    """
    Plots the most frequent keywords.
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set_title("Most Frequent Keywords")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Keyword")
    return fig

def topic_distribution_over_time(df, time_freq='M'):
    """
    Analyzes the distribution of topics over time.
    """
    # Group by time interval and topic
    df['time_period'] = df['date'].dt.to_period(time_freq)
    topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
    return topic_distribution

def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot each topic as a separate line
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig

def plot_topic_distribution_over_time_plotly(topic_distribution):
    """
    Plots the distribution of topics over time using Plotly.
    """
    topic_distribution = topic_distribution.reset_index()
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time", labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig

def plot_clusters(reduced_features, clusters):
    """
    Visualize clusters using t-SNE.
    Args:
        reduced_features (np.array): 2D array of reduced features.
        clusters (np.array): Cluster labels.
    Returns:
        fig (plt.Figure): Matplotlib figure object.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full"
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    return plt.gcf()

def get_cluster_labels(df, n_clusters):
    """
    Generate descriptive labels for each cluster based on top keywords.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        cluster_indices = df[df['cluster'] == cluster_id].index
        if len(cluster_indices) > 0:
            cluster_tfidf = tfidf_matrix[cluster_indices]
            top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
            cluster_labels[cluster_id] = ", ".join(vectorizer.get_feature_names_out()[top_keywords])
        else:
            cluster_labels[cluster_id] = "No dominant theme"
    return cluster_labels

def get_temporal_trends(df):
    """
    Analyze temporal trends for each cluster (peak day and time).
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            peak_day = cluster_data['day_of_week'].mode()[0]
            peak_time = cluster_data['hour'].mode()[0]
            temporal_trends[cluster_id] = {"peak_day": peak_day, "peak_time": f"{peak_time}:00"}
    return temporal_trends

def get_user_contributions(df):
    """
    Identify top contributors for each cluster.
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            top_users = cluster_data['user'].value_counts().head(3).index.tolist()
            user_contributions[cluster_id] = top_users
    return user_contributions

def get_sentiment_by_cluster(df):
    """
    Analyze sentiment distribution for each cluster.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            sentiment_by_cluster[cluster_id] = {
                "positive": round(sentiment_counts.get('positive', 0)),
                "neutral": round(sentiment_counts.get('neutral', 0)),
                "negative": round(sentiment_counts.get('negative', 0))
            }
    return sentiment_by_cluster

def detect_anomalies(df):
    """
    Detect anomalies in each cluster (e.g., high link or media share).
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            link_share = (cluster_data['message'].str.contains('http').mean()) * 100
            media_share = (cluster_data['message'].str.contains('<media omitted>').mean()) * 100
            if link_share > 50:
                anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
            elif media_share > 50:
                anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies

def generate_recommendations(df):
    """
    Generate actionable recommendations based on cluster insights.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            if sentiment_counts.get('negative', 0) > 50:
                recommendations.append(f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
            if cluster_data['message'].str.contains('http').mean() > 0.5:
                recommendations.append(f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations
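A small illustrative check (not part of the upload) of the columns helper.fetch_stats() reads; the toy frame below is hypothetical and mimics what preprocessor.preprocess() produces.

# Hypothetical toy frame with the columns fetch_stats() expects; values are invented.
import pandas as pd
import helper

toy = pd.DataFrame({
    "user": ["Alice", "Bob", "Alice"],
    "message": ["good morning", "check this out", ""],
    "unfiltered_messages": ["Good morning", "check this out https://example.com", "<media omitted>\n"],
})
msgs, words, media, links = helper.fetch_stats("Overall", toy)
print(msgs, words, media, links)  # expected: 3 messages, 5 words, 1 media message, 1 link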
preprocessor.py
ADDED
@@ -0,0 +1,199 @@
import re
import pandas as pd
import spacy
from langdetect import detect_langs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import streamlit as st

# Lighter model
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Cache model loading with fallback for quantization
@st.cache_resource
def load_model():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

    # Attempt quantization with fallback
    try:
        # Set quantization engine explicitly (fbgemm for x86, qnnpack for ARM)
        torch.backends.quantized.engine = 'fbgemm' if torch.cuda.is_available() else 'qnnpack'
        model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
        print("Model quantized successfully.")
    except RuntimeError as e:
        print(f"Quantization failed: {e}. Using non-quantized model.")

    config = AutoConfig.from_pretrained(MODEL)
    return tokenizer, model, config, device

tokenizer, model, config, device = load_model()

nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))

def preprocess(text):
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except:
            return ""
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def clean_message(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.replace("<media omitted>", "").replace("this message was deleted", "").replace("null", "")
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)
    return text.strip()

def lemmatize_text(text, lang):
    if lang == 'fr':
        doc = nlp_fr(text)
    else:
        doc = nlp_en(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

def preprocess(data):
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    filtered_messages, valid_dates = [], []

    for line in data.strip().split("\n"):
        match = re.match(pattern, line)
        if match:
            entry = match.groupdict()
            sender = entry.get("Sender")
            if sender and sender.strip().lower() != "system":
                filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
                valid_dates.append(f"{entry['Date']}, {entry['Time'].replace(' ', ' ')}")

    df = pd.DataFrame({'user_message': filtered_messages, 'message_date': valid_dates})
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users, messages = [], []
    msg_pattern = r"^(.*?):\s(.*)$"
    for message in df["user_message"]:
        match = re.match(msg_pattern, message)
        if match:
            users.append(match.group(1))
            messages.append(match.group(2))
        else:
            users.append("group_notification")
            messages.append(message)

    df["user"] = users
    df["message"] = messages
    df = df[df["user"] != "group_notification"].reset_index(drop=True)
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)

    # Extract time-based features
    df['year'] = pd.to_numeric(df['date'].dt.year, downcast='integer')
    df['month'] = df['date'].dt.month_name()
    df['day'] = pd.to_numeric(df['date'].dt.day, downcast='integer')
    df['hour'] = pd.to_numeric(df['date'].dt.hour, downcast='integer')
    df['day_of_week'] = df['date'].dt.day_name()

    # Lemmatize messages for topic modeling
    lemmatized_messages = []
    for message in df["message"]:
        try:
            lang = detect_langs(message)
            lemmatized_messages.append(lemmatize_text(message, lang))
        except:
            lemmatized_messages.append("")
    df["lemmatized_message"] = lemmatized_messages

    df = df[df["message"].notnull() & (df["message"] != "")].copy()
    df.drop(columns=["user_message"], inplace=True)

    # Perform topic modeling
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    dtm = vectorizer.fit_transform(df['lemmatized_message'])

    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)

    # Assign topics to messages
    topic_results = lda.transform(dtm)
    df = df.iloc[:topic_results.shape[0]].copy()
    df['topic'] = topic_results.argmax(axis=1)

    # Store topics for visualization
    topics = []
    for topic in lda.components_:
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print("Top words for each topic-----------------------------------------------------:")
    print(topics)

    return df, topics

def preprocess_for_clustering(df, n_clusters=5):
    df = df[df["lemmatized_message"].notnull() & (df["lemmatized_message"].str.strip() != "")]
    df = df.reset_index(drop=True)

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    if tfidf_matrix.shape[0] < 2:
        raise ValueError("Not enough messages for clustering.")

    df = df.iloc[:tfidf_matrix.shape[0]].copy()

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)

    df['cluster'] = clusters
    tsne = TSNE(n_components=2, random_state=42)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())

    return df, reduced_features, kmeans.cluster_centers_


def predict_sentiment_batch(texts: list, batch_size: int = 32) -> list:
    """Predict sentiment for a batch of texts"""
    if not isinstance(texts, list):
        raise TypeError(f"Expected list of texts, got {type(texts)}")

    processed_texts = [preprocess(text) for text in texts]

    predictions = []
    for i in range(0, len(processed_texts), batch_size):
        batch = processed_texts[i:i+batch_size]

        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        predictions.extend([config.id2label[p] for p in batch_preds])

    return predictions
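A quick illustrative sketch (not part of the upload) of the cleaning helpers in this module; the input string is invented, and note that simply importing preprocessor also loads the spaCy pipelines and the cardiffnlp model, so the first import is slow.

# Hypothetical input; clean_message() lowercases, strips URLs, media placeholders
# and punctuation, while lemmatize_text() runs the English or French spaCy
# pipeline depending on the language code passed in.
import preprocessor

raw = "Check this out!!! https://example.com <Media omitted>"
cleaned = preprocessor.clean_message(raw)            # roughly "check this out"
lemmas = preprocessor.lemmatize_text(cleaned, "en")  # lemmatized tokens joined by spaces
print(cleaned, "|", lemmas)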
requirements.txt
ADDED
@@ -0,0 +1,23 @@
streamlit
preprocessor
matplotlib
seaborn
urlextract
wordcloud
pandas
emoji
langdetect
tiktoken
googletrans
transformers==4.44.2
torch==2.4.0
sentencepiece==0.2.0
protobuf==5.28.0
scikit-learn
plotly
nltk
spacy==3.7.0
thinc>=8.1.8,<8.3.0
deep_translator
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl
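These pins install in one step with pip install -r requirements.txt; the two wheel URLs at the end pull the English and French spaCy models directly, so no separate spacy download step is needed.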
sentiment.py
ADDED
@@ -0,0 +1,98 @@
import pandas as pd
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

# Use a sentiment-specific model (replace with TinyBERT if fine-tuned)
MODEL = "tabularisai/multilingual-sentiment-analysis"  # Pre-trained for positive/negative sentiment

print("Loading model and tokenizer...")
start_load = time.time()

# Check for MPS (Metal) availability on M2 chip, fallback to CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load with optimizations (only once, removing redundancy)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
config = AutoConfig.from_pretrained(MODEL)

load_time = time.time() - start_load
print(f"Model and tokenizer loaded in {load_time:.2f} seconds\n")

# Optimized preprocessing (unchanged from your code)
def preprocess(text):
    if not isinstance(text, str):
        text = str(text) if not pd.isna(text) else ""

    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Batch prediction function (optimized for performance)
def predict_sentiment_batch(texts: list, batch_size: int = 16) -> list:
    if not isinstance(texts, list):
        raise TypeError(f"Expected list of texts, got {type(texts)}")

    # Validate and clean inputs
    valid_texts = [str(text) for text in texts if isinstance(text, str) and text.strip()]
    if not valid_texts:
        return []  # Return empty list if no valid texts

    print(f"Processing {len(valid_texts)} valid samples...")
    processed_texts = [preprocess(text) for text in valid_texts]

    predictions = []
    for i in range(0, len(processed_texts), batch_size):
        batch = processed_texts[i:i + batch_size]
        try:
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=64  # Reduced for speed on short texts like tweets
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
            predictions.extend([config.id2label[p] for p in batch_preds])
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {str(e)}")
            predictions.extend(["neutral"] * len(batch))  # Consider logging instead

    print(f"Predictions for {len(valid_texts)} samples generated in {time.time() - start_load:.2f} seconds")
    predictions = [prediction.lower().replace("very ", "") for prediction in predictions]

    print(predictions)

    return predictions

# # Example usage with your dataset (uncomment and adjust paths)
# test_data = pd.read_csv("/Users/caasidev/development/AI/last try/Whatssap-project/srcs/tweets.csv")
# print(f"Processing {len(test_data)} samples...")
# start_prediction = time.time()

# text_samples = test_data['text'].tolist()
# test_data['predicted_sentiment'] = predict_sentiment_batch(text_samples)

# prediction_time = time.time() - start_prediction
# time_per_sample = prediction_time / len(test_data)

# # Print runtime statistics
# print("\nRuntime Statistics:")
# print(f"- Model loading time: {load_time:.2f} seconds")
# print(f"- Total prediction time for {len(test_data)} samples: {prediction_time:.2f} seconds")
# print(f"- Average time per sample: {time_per_sample:.4f} seconds")
# print(f"- Estimated time for 1000 samples: {(time_per_sample * 1000):.2f} seconds")
# print(f"- Estimated time for 20000 samples: {(time_per_sample * 20000 / 60):.2f} minutes")

# # Print a sample of predictions
# print("\nPredicted Sentiments (first 5 samples):")
# print(test_data[['text', 'predicted_sentiment']].head())
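A minimal usage sketch (not part of the upload) of the batch helper defined above; the three example messages are invented, and the exact labels depend on the tabularisai model's output classes.

# Hypothetical smoke test for predict_sentiment_batch(); messages are invented.
from sentiment import predict_sentiment_batch

labels = predict_sentiment_batch([
    "I love this group, you are all amazing",
    "The meeting was fine, nothing special",
    "This is terrible, I am very disappointed",
])
print(labels)  # e.g. ['positive', 'neutral', 'negative'] after the "very " prefix is stripped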