"""Helper functions for a WhatsApp-style chat analyzer: per-user statistics,
word clouds, emoji and keyword counts, timelines, and topic/cluster insights."""

from collections import Counter

import emoji
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from urlextract import URLExtract
from wordcloud import WordCloud

# Single URL extractor, reused by every call to fetch_stats().
extract = URLExtract()

def fetch_stats(selected_user, df):
    """Return message, word, media, and link counts for the selected user."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    num_messages = df.shape[0]
    words = sum(len(msg.split()) for msg in df['message'])
    # Media placeholders and URLs are counted on the raw, unfiltered text.
    num_media_messages = df[df['unfiltered_messages'] == '<media omitted>\n'].shape[0]
    links = sum(len(extract.find_urls(msg)) for msg in df['unfiltered_messages'])
    return num_messages, words, num_media_messages, links

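# A minimal usage sketch (assumes the preprocessed dataframe exposes the
# 'user', 'message', and 'unfiltered_messages' columns used above; the user
# name is hypothetical):
#
#     num_messages, words, num_media, num_links = fetch_stats('Overall', df)
#     num_messages, words, num_media, num_links = fetch_stats('alice', df)
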
def most_busy_users(df):
    """Return the top senders and each user's percentage share of all messages."""
    x = df['user'].value_counts().head()
    percent_df = (df['user'].value_counts() / df.shape[0] * 100).round(2) \
        .rename_axis('Name').reset_index(name='percentage')
    return x, percent_df

def create_wordcloud(selected_user, df):
    """Build a word cloud from the selected user's messages, excluding
    group notifications and media placeholders."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]
    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc

def monthly_timeline(selected_user, df):
    """Return a per-month message count with a combined 'month-year' label."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    # Note: groupby sorts month names alphabetically within each year; include
    # a numeric month column in the groupby if chronological order is needed.
    return timeline

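# One way to restore calendar order, assuming 'month' holds full month names
# such as 'January' (a sketch, not required by the callers above):
#
#     order = pd.to_datetime(timeline['time'], format='%B-%Y')
#     timeline = timeline.assign(_order=order).sort_values('_order').drop(columns='_order')
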
def daily_timeline(selected_user, df):
    """Return a per-day message count."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df.groupby('date').count()['message'].reset_index()

def week_activity_map(selected_user, df):
    """Return message counts by day of the week."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['day_of_week'].value_counts()

def month_activity_map(selected_user, df):
    """Return message counts by month."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df['month'].value_counts()

def plot_topic_distribution_plotly(df):
    """Plot the distribution of topics as a bar chart using Plotly."""
    topic_counts = df['topic'].value_counts().sort_index()
    fig = px.bar(x=topic_counts.index, y=topic_counts.values, title="Topic Distribution",
                 labels={'x': 'Topic', 'y': 'Number of Messages'})
    return fig


def plot_clusters_plotly(reduced_features, clusters):
    """Visualize message clusters (t-SNE features) as a Plotly scatter plot."""
    fig = px.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1], color=clusters,
                     title="Message Clusters (t-SNE)")
    return fig

def most_common_words(selected_user, df):
    """Return the 20 most common non-stop-words for the selected user."""
    # The stop-word list is assumed to ship with the project as a plain-text
    # file (one word per line); fall back to no filtering if it is missing.
    try:
        with open('stop_words.txt', 'r', encoding='utf-8') as f:
            stop_words = set(f.read().split())
    except FileNotFoundError:
        stop_words = set()

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = [word for message in temp['message']
             for word in message.lower().split()
             if word not in stop_words]

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df

def emoji_helper(selected_user, df):
    """Count every emoji used by the selected user."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['unfiltered_messages']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA])

    # most_common() with no argument returns all emojis, ordered by frequency.
    emoji_df = pd.DataFrame(Counter(emojis).most_common())
    return emoji_df

def plot_topic_distribution(df):
    """
    Plots the distribution of topics in the chat data.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
    ax.set_title("Topic Distribution")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Messages")
    return fig

def most_frequent_keywords(messages, top_n=10):
    """
    Extracts the most frequent keywords from a list of messages.
    """
    words = [word for msg in messages for word in msg.split()]
    word_freq = Counter(words)
    return word_freq.most_common(top_n)

def plot_most_frequent_keywords(keywords):
    """
    Plots the most frequent keywords.
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set_title("Most Frequent Keywords")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Keyword")
    return fig

def topic_distribution_over_time(df, time_freq='M'):
    """
    Analyzes the distribution of topics over time.
    """
    df['time_period'] = df['date'].dt.to_period(time_freq)
    topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
    return topic_distribution

def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")
    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig

def plot_topic_distribution_over_time_plotly(topic_distribution):
    """
    Plots the distribution of topics over time using Plotly.
    """
    topic_distribution = topic_distribution.reset_index()
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')
    fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time",
                  labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig

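# Typical chaining (a sketch): build the per-period counts, then hand the
# same frame to either renderer.
#
#     dist = topic_distribution_over_time(df, time_freq='M')
#     fig = plot_topic_distribution_over_time(dist)          # matplotlib
#     fig = plot_topic_distribution_over_time_plotly(dist)   # plotly
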
def plot_clusters(reduced_features, clusters):
    """
    Visualize clusters using t-SNE.
    Args:
        reduced_features (np.array): 2D array of reduced features.
        clusters (np.array): Cluster labels.
    Returns:
        fig (plt.Figure): Matplotlib figure object.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full",
        ax=ax
    )
    ax.set_title("Message Clusters (t-SNE Visualization)")
    ax.set_xlabel("t-SNE Component 1")
    ax.set_ylabel("t-SNE Component 2")
    fig.tight_layout()
    return fig

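# The reduced features are expected from an upstream dimensionality-reduction
# step; for example (a sketch, not part of this module):
#
#     from sklearn.manifold import TSNE
#     reduced = TSNE(n_components=2, random_state=42).fit_transform(features)
#     fig = plot_clusters(reduced, clusters)
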
def get_cluster_labels(df, n_clusters):
    """
    Generate descriptive labels for each cluster based on top keywords.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    feature_names = vectorizer.get_feature_names_out()

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        # Use positional row numbers so the TF-IDF matrix rows line up with
        # the dataframe even when df carries a non-default index.
        cluster_positions = np.where(df['cluster'].values == cluster_id)[0]
        if len(cluster_positions) > 0:
            cluster_tfidf = tfidf_matrix[cluster_positions]
            # Top three terms by summed TF-IDF weight, highest first.
            top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
            cluster_labels[cluster_id] = ", ".join(feature_names[top_keywords])
        else:
            cluster_labels[cluster_id] = "No dominant theme"
    return cluster_labels

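# Hypothetical call (assumes an upstream step added 'lemmatized_message' and
# numeric 'cluster' columns to the dataframe; the labels shown are invented):
#
#     labels = get_cluster_labels(df, n_clusters=5)
#     # e.g. {0: "exam, notes, lecture", 1: "match, team, score", ...}
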
def get_temporal_trends(df):
    """
    Analyze temporal trends for each cluster (peak day and time).
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            peak_day = cluster_data['day_of_week'].mode()[0]
            peak_time = cluster_data['hour'].mode()[0]
            temporal_trends[cluster_id] = {"peak_day": peak_day, "peak_time": f"{peak_time}:00"}
    return temporal_trends

def get_user_contributions(df):
    """
    Identify top contributors for each cluster.
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            top_users = cluster_data['user'].value_counts().head(3).index.tolist()
            user_contributions[cluster_id] = top_users
    return user_contributions

def get_sentiment_by_cluster(df):
    """
    Analyze sentiment distribution for each cluster.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            sentiment_by_cluster[cluster_id] = {
                "positive": round(sentiment_counts.get('positive', 0)),
                "neutral": round(sentiment_counts.get('neutral', 0)),
                "negative": round(sentiment_counts.get('negative', 0))
            }
    return sentiment_by_cluster

def detect_anomalies(df):
    """
    Detect anomalies in each cluster (e.g., high link or media share).
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            link_share = cluster_data['message'].str.contains('http').mean() * 100
            # Match the media placeholder case-insensitively, as elsewhere.
            media_share = cluster_data['message'].str.contains('<media omitted>', case=False).mean() * 100
            if link_share > 50:
                anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
            elif media_share > 50:
                anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies

def generate_recommendations(df):
    """
    Generate actionable recommendations based on cluster insights.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            if sentiment_counts.get('negative', 0) > 50:
                recommendations.append(f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
            if cluster_data['message'].str.contains('http').mean() > 0.5:
                recommendations.append(f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations

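
if __name__ == '__main__':
    # Smoke-test sketch, not part of the app: a tiny dataframe with the
    # columns the basic helpers above expect. In the real app this frame
    # comes from the project's preprocessing step, so these rows and user
    # names are invented.
    data = pd.DataFrame({
        'user': ['alice', 'bob', 'alice'],
        'message': ['hello world', 'check http://example.com', 'see you'],
        'unfiltered_messages': ['hello world 😀', 'check http://example.com', 'see you'],
    })
    print(fetch_stats('Overall', data))  # (3, 6, 0, 1)
    print(most_busy_users(data))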