from collections import Counter

import emoji
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from urlextract import URLExtract
from wordcloud import WordCloud

extract = URLExtract()

def fetch_stats(selected_user, df):
    """Return total messages, total words, media messages, and links shared."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    num_messages = df.shape[0]
    words = sum(len(msg.split()) for msg in df['message'])
    num_media_messages = df[df['unfiltered_messages'] == '<media omitted>\n'].shape[0]
    links = sum(len(extract.find_urls(msg)) for msg in df['unfiltered_messages'])
    return num_messages, words, num_media_messages, links
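
# Usage sketch (assumes `df` comes from the project's preprocessing step and
# carries 'user', 'message', and 'unfiltered_messages' columns):
#   num_messages, words, num_media, num_links = fetch_stats('Overall', df)
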
def most_busy_users(df):
    """Return the five most active users and per-user message percentages."""
    x = df['user'].value_counts().head()
    percentages = round((df['user'].value_counts() / df.shape[0]) * 100, 2).reset_index()
    # The value_counts index holds the names; the original rename had these
    # two labels swapped.
    percentages.columns = ['Name', 'percentage']
    return x, percentages

def create_wordcloud(selected_user, df):
    """Build a word cloud from the selected user's messages."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc
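
# Rendering sketch: WordCloud.generate() returns an image-like object that
# matplotlib can draw directly.
#   fig, ax = plt.subplots()
#   ax.imshow(create_wordcloud('Overall', df))
#   ax.axis('off')
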
def monthly_timeline(selected_user, df):
    """Return message counts per (year, month) with a combined time label."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline

def daily_timeline(selected_user, df):
    """Return message counts per calendar date."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df.groupby('date').count()['message'].reset_index()

def week_activity_map(selected_user, df):
    """Return message counts by day of the week."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['day_of_week'].value_counts()

def month_activity_map(selected_user, df):
    """Return message counts by month."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['month'].value_counts()
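
# Usage sketch for the timeline/activity helpers (assumes the preprocessor
# added 'year', 'month', 'date', and 'day_of_week' columns):
#   timeline = monthly_timeline('Overall', df)
#   plt.plot(timeline['time'], timeline['message'])
#   busy_days = week_activity_map('Overall', df)  # Series indexed by weekday
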
def most_common_words(selected_user, df):
    """Return the 20 most common non-stop-words for the selected user."""
    # Load the project's Hinglish stop-word list; the original assigned
    # stop_words = df, which checked words against the DataFrame instead.
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = f.read().split()
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)
    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df
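
# Assumption: stop_hinglish.txt lives next to this module and lists one stop
# word per line (mixed Hindi/English). The returned DataFrame has two unnamed
# columns: 0 = word, 1 = count.
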
def emoji_helper(selected_user, df):
    """Count every emoji used by the selected user."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    emojis = []
    for message in df['unfiltered_messages']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA])
    # most_common() with no argument already returns all entries.
    emoji_df = pd.DataFrame(Counter(emojis).most_common())
    return emoji_df
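
# Usage sketch: column 0 holds the emoji, column 1 its count.
#   emoji_df = emoji_helper('Overall', df).rename(columns={0: 'emoji', 1: 'count'})
#   fig = px.bar(emoji_df.head(10), x='emoji', y='count')
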
def plot_topic_distribution(df):
    """
    Plots the distribution of topics in the chat data.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
    ax.set_title("Topic Distribution")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Messages")
    return fig

def most_frequent_keywords(messages, top_n=10):
    """
    Extracts the most frequent keywords from a list of messages.
    """
    words = [word for msg in messages for word in msg.split()]
    word_freq = Counter(words)
    return word_freq.most_common(top_n)

def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # Plot each topic as a separate line
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")
    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig

def plot_most_frequent_keywords(keywords):
    """
    Plots the most frequent keywords.
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set_title("Most Frequent Keywords")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Keyword")
    return fig

def topic_distribution_over_time(df, time_freq='M'):
    """
    Analyzes the distribution of topics over time.
    """
    # Group by time interval and topic
    df['time_period'] = df['date'].dt.to_period(time_freq)
    topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
    return topic_distribution
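
# Pipeline sketch (assumes a 'topic' column produced by an upstream topic
# model and a datetime64 'date' column):
#   dist = topic_distribution_over_time(df, time_freq='M')
#   fig_mpl = plot_topic_distribution_over_time(dist)         # matplotlib
#   fig_px = plot_topic_distribution_over_time_plotly(dist)   # interactive
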
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """
    Plots the distribution of topics over time using Plotly.
    """
    topic_distribution = topic_distribution.reset_index()
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
    fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time",
                  labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig

def plot_clusters(reduced_features, clusters):
    """
    Visualize clusters using t-SNE.
    Args:
        reduced_features (np.array): 2D array of reduced features.
        clusters (np.array): Cluster labels.
    Returns:
        fig (plt.Figure): Matplotlib figure object.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full"
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    return plt.gcf()
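
# Sketch of producing plot_clusters() inputs; the vectorizer and cluster count
# below are illustrative assumptions, not fixed by this module:
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.cluster import KMeans
#   from sklearn.manifold import TSNE
#   X = TfidfVectorizer(max_features=5000).fit_transform(df['message'])
#   clusters = KMeans(n_clusters=5, n_init=10).fit_predict(X)
#   reduced = TSNE(n_components=2).fit_transform(X.toarray())
#   fig = plot_clusters(reduced, clusters)
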
def get_cluster_labels(df, n_clusters):
    """
    Generate descriptive labels for each cluster based on top keywords.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    cluster_labels = {}
    for cluster_id in range(n_clusters):
        cluster_indices = df[df['cluster'] == cluster_id].index
        if len(cluster_indices) > 0:
            cluster_tfidf = tfidf_matrix[cluster_indices]
            top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
            cluster_labels[cluster_id] = ", ".join(vectorizer.get_feature_names_out()[top_keywords])
        else:
            cluster_labels[cluster_id] = "No dominant theme"
    return cluster_labels
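
# Usage sketch: assumes the clustering step added 'cluster' and
# 'lemmatized_message' columns and that df keeps a default RangeIndex so
# df.index lines up with the TF-IDF matrix rows.
#   labels = get_cluster_labels(df, n_clusters=5)
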
def get_temporal_trends(df):
    """
    Analyze temporal trends for each cluster (peak day and time).
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            peak_day = cluster_data['day_of_week'].mode()[0]
            peak_time = cluster_data['hour'].mode()[0]
            temporal_trends[cluster_id] = {"peak_day": peak_day, "peak_time": f"{peak_time}:00"}
    return temporal_trends

def get_user_contributions(df):
    """
    Identify top contributors for each cluster.
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            top_users = cluster_data['user'].value_counts().head(3).index.tolist()
            user_contributions[cluster_id] = top_users
    return user_contributions

def get_sentiment_by_cluster(df):
    """
    Analyze sentiment distribution for each cluster.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            sentiment_by_cluster[cluster_id] = {
                "positive": round(sentiment_counts.get('positive', 0)),
                "neutral": round(sentiment_counts.get('neutral', 0)),
                "negative": round(sentiment_counts.get('negative', 0))
            }
    return sentiment_by_cluster

def detect_anomalies(df):
    """
    Detect anomalies in each cluster (e.g., high link or media share).
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            link_share = (cluster_data['message'].str.contains('http').mean()) * 100
            media_share = (cluster_data['message'].str.contains('<media omitted>').mean()) * 100
            if link_share > 50:
                anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
            elif media_share > 50:
                anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies

def generate_recommendations(df):
    """
    Generate actionable recommendations based on cluster insights.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            if sentiment_counts.get('negative', 0) > 50:
                recommendations.append(f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
            if cluster_data['message'].str.contains('http').mean() > 0.5:
                recommendations.append(f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations
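
# End-to-end sketch, kept as comments so importing this module has no side
# effects. `preprocess` is a hypothetical stand-in for the project's chat
# parser; it must return a DataFrame with the columns the helpers expect.
#   if __name__ == "__main__":
#       from preprocessor import preprocess  # hypothetical import
#       df = preprocess(open('chat.txt', encoding='utf-8').read())
#       print(fetch_stats('Overall', df))
#       print(generate_recommendations(df))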