RatanPrakash
Merge remote-tracking branch 'huggingface/master'
92ada6e
import pandas as pd
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import emoji
import numpy as np
import seaborn as sns
def preprocess_data(chat_data):
date_time_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s'
messages = re.split(date_time_pattern, chat_data)[1:]
date_times = re.findall(date_time_pattern, chat_data)
df = pd.DataFrame({'date_time': date_times, 'message': messages})
df['date_time'] = df['date_time'].str.replace('\u202f', ' ')
df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %I:%M %p - ')
#separate users and their corresponding messages
users = []
messages = []
for message in df['message']:
entry = re.split('([\w\W]+?):\s', message)
if len(entry) > 1:
users.append(entry[1])
messages.append(entry[2])
else:
users.append('group_notification')
messages.append(entry[0])
df['user'] = users
df['message'] = messages
# extract date month year hour and minute from date_time
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
df['day'] = df['date_time'].dt.day
df['month'] = df['date_time'].dt.month
df['year'] = df['date_time'].dt.year
df['weekday'] = df['date_time'].dt.weekday
df['weekday_en'] = df['weekday'].map({0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'})
df['month_en'] = df['month'].map({1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'})
# df['only_date'] = df['date_time'].dt.date
df.sample(20)
return df
def fetch_stats(selected_user, df):
if selected_user == 'All':
user_df = df
else:
user_df = df[df['user'] == selected_user]
total_messages = user_df.shape[0]
media_messages = user_df[user_df['message'] == '<Media omitted>\n']
media_messages = media_messages.shape[0]
links = user_df[user_df['message'].str.contains('http')]
print(links)
links = links.shape[0]
emojis = user_df[user_df['message'].str.contains('[\U0001F600-\U0001F650]')].shape[0]
words = user_df['message'].apply(lambda x: len(x.split()))
total_words = words.sum()
return user_df, total_messages, media_messages, links, emojis, total_words
def busiest_users(df):
user_message_counts = df['user'].value_counts().reset_index()
user_message_counts.columns = ['user', 'message_count']
user_message_counts['percentage'] = (user_message_counts['message_count'] / user_message_counts['message_count'].sum()) * 100
user_message_counts['percentage'] = user_message_counts['percentage'].round(2)
user_message_counts = user_message_counts.sort_values(by='message_count', ascending=False)
busiest_users = user_message_counts
plt.bar(busiest_users.user, busiest_users.message_count)
plt.xticks(rotation=45)
plt.title('Busiest users')
plt.xlabel('Users')
plt.ylabel('Total messages')
return busiest_users, plt
def word_cloud(df, selected_user):
if selected_user == 'All':
user_df = df
else:
user_df = df[df['user'] == selected_user]
f = open('stop_hinglish.txt','r')
stop_words = f.read()
temp = user_df[user_df['user'] != 'group_notification']
temp = temp[temp['message'] != '<Media omitted>\n']
temp = temp[temp['message'] != 'This message was deleted\n']
temp = temp[temp['message'] != 'You deleted this message\n']
words = []
for message in temp['message']:
for word in message.lower().split():
if word not in stop_words:
words.append(word)
words = ' '.join(words)
wordcloud = WordCloud(width=800, height=400, random_state=21, max_font_size=110, background_color='white')
wordcloud = wordcloud.generate(words)
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
return plt
def most_common_words(selected_user, df):
f = open('stop_hinglish.txt','r')
stop_words = f.read()
if selected_user != 'All':
df = df[df['user'] == selected_user]
temp = df[df['user'] != 'group_notification']
temp = temp[temp['message'] != '<Media omitted>\n']
temp = temp[temp['message'] != 'This message was deleted\n']
temp = temp[temp['message'] != 'You deleted this message\n']
words = []
for message in temp['message']:
for word in message.lower().split():
if word not in stop_words:
words.append(word)
most_common_df = pd.DataFrame(Counter(words).most_common(50))
most_common_df.columns = ['word', 'word_count']
fig, ax = plt.subplots(figsize=(10, 15))
ax.barh(most_common_df.word, most_common_df.word_count, height=0.8)
ax.set_xlabel('Word count')
ax.set_ylabel('Words')
ax.set_title('Most common words')
return most_common_df, fig
def most_common_emojis(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
emojis = []
for message in df['message']:
emojis.extend([c for c in message if c in emoji.EMOJI_DATA])
emoji_df = pd.DataFrame(Counter(emojis).most_common(len(Counter(emojis))))
emoji_df.columns = ['emoji', 'emoji_count']
return emoji_df
def monthly_timeline(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
monthly_timeline = df.groupby(['year', 'month_en']).count()['message'].reset_index()
monthly_timeline['month_year'] = monthly_timeline['month_en'] + ' ' + monthly_timeline['year'].astype(str)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(monthly_timeline['month_year'], monthly_timeline['message'])
ax.set_title('Messages sent over Months')
ax.set_xlabel('Month-Year')
ax.set_ylabel('Total messages')
return monthly_timeline, fig
def daily_timeline(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
daily_timeline = df.groupby('date').count()['message'].reset_index()
daily_timeline.columns = ['date', 'message_count']
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(daily_timeline['date'], daily_timeline['message_count'])
ax.set_title('Messages sent over time')
ax.set_xlabel('Date')
ax.set_ylabel('Total messages')
return daily_timeline, fig
def weekday_activity_map(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
week_df = df['weekday_en'].value_counts().reset_index()
week_df.columns = ['weekday', 'message_count']
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(week_df['weekday'], week_df['message_count'])
ax.set_title('Messages sent per weekday')
ax.set_xlabel('Weekday')
ax.set_ylabel('Total messages')
return None, fig
def month_activity_map(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
month_df = df['month_en'].value_counts().reset_index()
month_df.columns = ['month', 'message_count']
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(month_df['month'], month_df['message_count'])
ax.set_title('Messages sent per month')
ax.set_xlabel('Month')
ax.set_ylabel('Total messages')
return month_df, fig
def hour_activity_map(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
# Count the number of messages per hour
hour_counts = df['hour'].value_counts().sort_index()
# Convert hours to radians
hours = np.arange(24)
radians = 2 * np.pi * (hours / 24)
# Create a polar plot
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': 'polar'})
ax.bar(radians, hour_counts, width=0.3, bottom=0.2)
ax.set_theta_direction(-1) # Clockwise
ax.set_theta_offset(np.pi / 2.0) # Start from top
ax.set_xticks(radians)
ax.set_xticklabels(hours)
ax.set_yticklabels([])
ax.set_title('Busiest Hours of the Day', va='bottom')
return hour_counts, fig
def activity_heatmap(selected_user, df):
if selected_user != 'All':
df = df[df['user'] == selected_user]
heatmap_data = df.pivot_table(index='weekday_en', columns='hour', values='message', aggfunc='count', fill_value=0)
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap='gray_r', linewidths=.5, fmt='d')
plt.title('Activity Heatmap')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
return plt
def extract_links(df):
links = df[df['message'].str.contains('http', na=False)]
links = links['message'].str.extractall(r'(https?://\S+)')[0]
return links.reset_index(drop=True)
def plot_common_domains(df):
links = extract_links(df)
domains = links.str.extract(r'https?://(?:www\.)?([^/]+)')[0]
domain_counts = domains.value_counts().reset_index()
domain_counts.columns = ['domain', 'count']
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(domain_counts['domain'], domain_counts['count'], width=0.5)
plt.xticks(rotation='vertical')
ax.set_title('Most Common Domains')
ax.set_xlabel('Domain')
ax.set_ylabel('Count')
return domain_counts, fig