Spaces:

hansche
/

SocialMediaFoci

Running

File size: 21,388 Bytes

d5ba1b1

import streamlit as st
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor, helper
from sentiment import predict_sentiment_batch
import os
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"

# Plot styling: seaborn's white-grid theme applies to every chart drawn below.
sns.set_theme(style="whitegrid")

# Page styling: inject a small CSS override for the main container background.
PAGE_CSS = (
    "\n"
    "    <style>\n"
    "    .main {background-color: #f0f2f6;}\n"
    "    </style>\n"
    "    "
)
st.markdown(PAGE_CSS, unsafe_allow_html=True)

# Page heading followed by the numbered usage instructions.
st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
INSTRUCTION_STEPS = (
    "1. Open the sidebar and upload your WhatsApp chat file in .txt format.",
    "2. Wait for the initial processing (minimal delay).",
    "3. Customize the analysis by selecting users or filters.",
    "4. Click 'Show Analysis' for detailed results.",
)
for instruction_step in INSTRUCTION_STEPS:
    st.markdown(instruction_step)

# Sidebar: title plus the uploader; `uploaded_file` stays None until a .txt file is chosen.
sidebar = st.sidebar
sidebar.title("Whatsapp Chat Analyzer")
uploaded_file = sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

@st.cache_data
def load_and_preprocess(file_content):
    """Parse the decoded chat export with ``preprocessor.preprocess``.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    result for identical ``file_content`` instead of re-parsing on each rerun.

    Args:
        file_content: The chat export text handed straight to the preprocessor.

    Returns:
        Whatever ``preprocessor.preprocess`` returns, unchanged.
    """
    parsed_chat = preprocessor.preprocess(file_content)
    return parsed_chat

# Colour used for each sentiment label in every sentiment chart.
SENTIMENT_COLORS = {"negative": "red", "neutral": "blue", "positive": "green"}

# Full month name -> abbreviation, listed in calendar order so the abbreviations
# can also be used to sort chart categories chronologically.
MONTH_ABBREVIATIONS = {
    'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
    'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
    'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
}

DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _render_figure(fig):
    """Show a Matplotlib figure in the app, then close it.

    ``plt.clf()`` only clears a figure; closing it is what releases it, which
    matters because the script reruns on every widget interaction.
    """
    st.pyplot(fig)
    if isinstance(fig, plt.Figure):
        plt.close(fig)
    else:
        # Unknown object (e.g. a helper returned something else): drop all open figures.
        plt.close("all")


def _render_timeline(timeline, x, color, title):
    """Render a 10x5 line chart of the ``message`` column against column *x*."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=timeline, x=x, y='message', color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Messages")
    _render_figure(fig)


def _render_count_bars(counts, palette, title, xlabel, rotation=None):
    """Render a 10x5 bar chart of a count Series (index on x, values on y)."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=counts.index, y=counts.values, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Message Count")
    if rotation is not None:
        ax.tick_params(axis="x", labelrotation=rotation)
    _render_figure(fig)


def _add_sentiment_column(frame, cache_key):
    """Add a ``sentiment`` column to *frame* in place.

    Only messages with non-whitespace text are sent to
    ``predict_sentiment_batch``; rows with blank text get ``None`` so the
    column always has one entry per row. Results are remembered in
    ``st.session_state`` under *cache_key* so reruns triggered by other
    widgets do not repeat the prediction.

    Raises:
        ValueError: If the model returns a different number of labels than
            the number of messages it was given.
    """
    cached = st.session_state.get("sentiment_cache")
    if cached is not None and cached[0] == cache_key and len(cached[1]) == len(frame):
        frame['sentiment'] = cached[1]
        return

    print("Starting sentiment analysis...")
    messages = frame["message"].astype(str)
    has_text = (messages.str.strip() != "").to_numpy()
    expected = int(has_text.sum())
    # Message contents are deliberately not logged: they are private chat data.
    print(f"Processing {expected} messages")

    predictions = list(predict_sentiment_batch(messages[has_text].tolist()))
    if len(predictions) != expected:
        raise ValueError(
            f"expected {expected} sentiment labels but received {len(predictions)}"
        )

    labels = pd.Series([None] * len(frame), dtype=object)
    labels.iloc[has_text] = predictions
    frame['sentiment'] = labels.to_numpy()
    st.session_state.sentiment_cache = (cache_key, labels.to_numpy())
    print("Sentiment analysis completed successfully")


if uploaded_file is not None:
    # getvalue() is independent of the stream position, so it is safe across reruns;
    # "utf-8-sig" also strips a leading byte-order mark if the export has one.
    raw_data = uploaded_file.getvalue().decode("utf-8-sig")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    if selected_user == "Overall":
        df_filtered = df
    else:
        # Copy the slice so the columns added below do not trigger SettingWithCopyWarning.
        df_filtered = df[df["user"] == selected_user].copy()

    # A button is only True on the rerun caused by its own click. The analysis
    # contains widgets of its own, so remember which (file, user) pair was
    # requested; otherwise touching any of those widgets would hide the results.
    analysis_key = (hash(raw_data), selected_user)
    if st.sidebar.button("Show Analysis"):
        st.session_state.analysis_key = analysis_key

    if st.session_state.get("analysis_key") == analysis_key:
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
                if 'sentiment' not in df_filtered.columns:
                    try:
                        _add_sentiment_column(df_filtered, analysis_key)
                    except Exception as e:
                        st.error(f"Sentiment analysis failed: {str(e)}")
                        print(f"Full error: {str(e)}")
                st.session_state.df_filtered = df_filtered

                # Display statistics and visualizations
                num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
                st.title("Top Statistics")
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.header("Total Messages")
                    st.title(num_messages)
                with col2:
                    st.header("Total Words")
                    st.title(words)
                with col3:
                    st.header("Media Shared")
                    st.title(num_media)
                with col4:
                    st.header("Links Shared")
                    st.title(num_links)

                # Timelines are drawn from at most 5000 rows; the fixed seed keeps
                # the charts stable when the script reruns.
                sample_rows = min(5000, len(df_filtered))

                st.title("Monthly Timeline")
                timeline = helper.monthly_timeline(
                    selected_user, df_filtered.sample(sample_rows, random_state=0)
                )
                if not timeline.empty:
                    _render_timeline(timeline, 'time', 'green', "Monthly Timeline")

                st.title("Daily Timeline")
                daily_timeline = helper.daily_timeline(
                    selected_user, df_filtered.sample(sample_rows, random_state=0)
                )
                if not daily_timeline.empty:
                    _render_timeline(daily_timeline, 'date', 'black', "Daily Timeline")

                st.title("Activity Map")
                col1, col2 = st.columns(2)
                with col1:
                    st.header("Most Busy Day")
                    busy_day = helper.week_activity_map(selected_user, df_filtered)
                    if not busy_day.empty:
                        _render_count_bars(busy_day, "Purples_r", "Most Busy Day", "Day of Week")
                with col2:
                    st.header("Most Busy Month")
                    busy_month = helper.month_activity_map(selected_user, df_filtered)
                    if not busy_month.empty:
                        _render_count_bars(busy_month, "Oranges_r", "Most Busy Month", "Month")

                if selected_user == 'Overall':
                    st.title("Most Busy Users")
                    x, new_df = helper.most_busy_users(df_filtered)
                    if not x.empty:
                        _render_count_bars(x, "Reds_r", "Most Busy Users", "User", rotation=45)
                        st.title("Word Count by User")
                        st.dataframe(new_df)

                # Most common words analysis
                st.title("Most Common Words")
                most_common_df = helper.most_common_words(selected_user, df_filtered)
                if not most_common_df.empty:
                    fig, ax = plt.subplots(figsize=(10, 6))
                    sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                    ax.set_title("Top 20 Most Common Words")
                    ax.set_xlabel("Frequency")
                    ax.set_ylabel("Words")
                    ax.tick_params(axis="x", labelrotation=90)
                    _render_figure(fig)
                else:
                    st.warning("No data available for most common words.")

                # Emoji analysis
                st.title("Emoji Analysis")
                emoji_df = helper.emoji_helper(selected_user, df_filtered)
                if not emoji_df.empty:
                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("Top Emojis Used")
                        st.dataframe(emoji_df)

                    with col2:
                        fig, ax = plt.subplots(figsize=(8, 8))
                        ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                               autopct="%0.2f%%", startangle=90,
                               colors=sns.color_palette("pastel"))
                        ax.set_title("Top Emoji Distribution")
                        _render_figure(fig)
                else:
                    st.warning("No data available for emoji analysis.")

                # Sentiment Analysis Visualizations
                st.title("📈 Sentiment Analysis")

                # Convert month names to abbreviated format
                df_filtered['month'] = df_filtered['month'].map(MONTH_ABBREVIATIONS)

                # The charts below need at least one labelled message; without this
                # guard a failed prediction would surface again as a KeyError.
                has_sentiment = (
                    'sentiment' in df_filtered.columns
                    and bool(df_filtered['sentiment'].notna().any())
                )

                if not has_sentiment:
                    st.warning("Sentiment data is not available, so the sentiment charts are skipped.")
                else:
                    # Group by month and sentiment
                    monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)

                    # groupby sorts the abbreviations alphabetically; restore calendar order.
                    calendar = list(MONTH_ABBREVIATIONS.values())
                    ordered_months = sorted(
                        monthly_sentiment.index,
                        key=lambda m: calendar.index(m) if m in calendar else len(calendar),
                    )
                    monthly_sentiment = monthly_sentiment.loc[ordered_months]

                    # Plotting: Histogram (Bar Chart) for each sentiment
                    st.write("### Sentiment Count by Month (Histogram)")

                    # One subplot per sentiment; a panel stays empty if that label never occurs.
                    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
                    for ax, label in zip(axes, ('positive', 'neutral', 'negative')):
                        if label in monthly_sentiment:
                            ax.bar(monthly_sentiment.index, monthly_sentiment[label],
                                   color=SENTIMENT_COLORS[label])
                        ax.set_title(f"{label.capitalize()} Sentiment")
                        ax.set_xlabel('Month')
                        ax.set_ylabel('Count')
                    _render_figure(fig)

                    # Count sentiments per day of the week, with days in weekday order
                    # and zero (not NaN) for days that have no messages.
                    sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
                    sentiment_counts = sentiment_counts.reindex(DAY_ORDER, fill_value=0)

                    # Daily Sentiment Analysis
                    st.write("### Daily Sentiment Analysis")

                    # Pick colours by column name so they stay correct when a label is missing.
                    bar_colors = [SENTIMENT_COLORS.get(label, 'gray') for label in sentiment_counts.columns]
                    fig, ax = plt.subplots(figsize=(10, 5))
                    sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=bar_colors)
                    ax.set_xlabel("Day of the Week")
                    ax.set_ylabel("Count")
                    ax.set_title("Sentiment Distribution per Day of the Week")
                    ax.legend(title="Sentiment")
                    _render_figure(fig)

                    # Count messages per user per sentiment (only for Overall view)
                    if selected_user == 'Overall':
                        sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                        # Calculate total messages per sentiment
                        total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()

                        # Share of each sentiment's messages that came from this user
                        sentiment_counts['Percentage'] = sentiment_counts.apply(
                            lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
                        )

                        # Sentiment Contribution Analysis
                        st.write("### Sentiment Contribution by User")

                        # One column per sentiment, each listing its top ten contributors.
                        sections = (
                            ('positive', "Top Positive Contributors", "No positive sentiment data"),
                            ('neutral', "Top Neutral Contributors", "No neutral sentiment data"),
                            ('negative', "Top Negative Contributors", "No negative sentiment data"),
                        )
                        for column, (label, heading, empty_text) in zip(st.columns(3), sections):
                            top_users = (
                                sentiment_counts[sentiment_counts['sentiment'] == label]
                                .sort_values(by='Count', ascending=False)
                                .head(10)
                            )
                            with column:
                                st.subheader(heading)
                                if not top_users.empty:
                                    st.dataframe(top_users[['user', 'Count', 'Percentage']])
                                else:
                                    st.warning(empty_text)

                # Topic Analysis Section
                st.title("🔍 Area of Focus: Topic Analysis")

                # NOTE(review): no topic modelling runs here; the sections below rely on
                # the helper functions and on a 'topic' column if one is present — confirm
                # where that column is produced.

                # Plot Topic Distribution
                st.header("Topic Distribution")
                try:
                    fig = helper.plot_topic_distribution(df_filtered)
                    _render_figure(fig)
                except Exception as e:
                    st.warning(f"Could not display topic distribution: {str(e)}")

                # Display Sample Messages for Each Topic
                st.header("Sample Messages for Each Topic")
                if 'topic' in df_filtered.columns:
                    for topic_id in sorted(df_filtered['topic'].unique()):
                        st.subheader(f"Topic {topic_id}")

                        # Get messages for the current topic
                        filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']

                        # Show up to five randomly chosen messages
                        sample_size = min(5, len(filtered_messages))

                        if sample_size > 0:
                            sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                            for msg in sample_messages:
                                st.write(f"- {msg}")
                        else:
                            st.write("No messages available for this topic.")
                else:
                    st.warning("Topic information not available")

                # Topic Distribution Over Time
                st.header("📅 Topic Trends Over Time")

                # Add time frequency selector
                time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

                # Plot topic trends
                try:
                    freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                    topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                    # Choose between static and interactive plot
                    use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')

                    if use_plotly:
                        fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                        st.plotly_chart(fig, use_container_width=True)
                    else:
                        fig = helper.plot_topic_distribution_over_time(topic_distribution)
                        _render_figure(fig)
                except Exception as e:
                    st.warning(f"Could not display topic trends: {str(e)}")

                # Clustering Analysis Section
                st.title("🧩 Conversation Clusters")

                # Number of clusters input
                n_clusters = st.slider("Select number of clusters",
                                       min_value=2,
                                       max_value=10,
                                       value=5,
                                       key='n_clusters')

                # Perform clustering
                with st.spinner("Analyzing conversation clusters..."):
                    try:
                        df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                        # Plot clusters
                        st.header("Cluster Visualization")
                        fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                        _render_figure(fig)

                        # Cluster Insights
                        st.header("📌 Cluster Insights")

                        # 1. Dominant Conversation Themes
                        st.subheader("1. Dominant Themes")
                        cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                        for cluster_id, label in cluster_labels.items():
                            st.write(f"**Cluster {cluster_id}**: {label}")

                        # 2. Temporal Patterns
                        st.subheader("2. Temporal Patterns")
                        temporal_trends = helper.get_temporal_trends(df_clustered)
                        for cluster_id, trend in temporal_trends.items():
                            st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                        # 3. User Contributions
                        if selected_user == 'Overall':
                            st.subheader("3. Top Contributors")
                            user_contributions = helper.get_user_contributions(df_clustered)
                            for cluster_id, users in user_contributions.items():
                                st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                        # 4. Sentiment by Cluster
                        st.subheader("4. Sentiment Analysis")
                        sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                        for cluster_id, sentiment in sentiment_by_cluster.items():
                            st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")

                        # Sample messages from each cluster
                        st.subheader("Sample Messages")
                        for cluster_id in sorted(df_clustered['cluster'].unique()):
                            with st.expander(f"Cluster {cluster_id} Messages"):
                                cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                                sample_size = min(3, len(cluster_msgs))
                                if sample_size > 0:
                                    for msg in cluster_msgs.sample(sample_size, replace=False):
                                        st.write(f"- {msg}")
                                else:
                                    st.write("No messages available")

                    except Exception as e:
                        st.error(f"Clustering failed: {str(e)}")