"""Parsing and plotting helpers for an exported chat log.

`preprocess_data` turns the raw export text into a DataFrame; every other
function takes that DataFrame (optionally restricted to one user) and returns
statistics and/or a matplotlib plot.
"""

import re
from collections import Counter
# NOTE(review): `back` is not used anywhere in this module and importing
# `turtle` pulls in tkinter; this looks like an accidental IDE auto-import.
# Kept only because other parts of the project may not be visible here.
from turtle import back

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

# Timestamp prefix that starts every exported entry, e.g. "1/2/23, 9:05 PM - ".
_TIMESTAMP_PATTERN = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s'
_TIMESTAMP_FORMAT = '%m/%d/%y, %I:%M %p - '

# "<sender>: <text>"; the sender is everything before the first ": ".
_SENDER_PATTERN = re.compile(r'([\w\W]+?):\s')

_WEEKDAY_NAMES = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
_MONTH_NAMES = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

_STOP_WORDS_FILE = 'stop_hinglish.txt'

# Message bodies that stand for an attachment. The bare newline is what the
# code historically matched and is kept for backward compatibility;
# '<Media omitted>' is presumably the placeholder the chat export writes for
# attachments -- TODO confirm against a real export.
_MEDIA_PLACEHOLDERS = ['\n', '<Media omitted>\n']

# Message bodies that carry no user-written text and must not feed word stats.
_NON_TEXT_MESSAGES = _MEDIA_PLACEHOLDERS + [
    'This message was deleted\n',
    'You deleted this message\n',
]


def _filter_user(df, selected_user):
    """Return *df* unchanged for 'All', otherwise only *selected_user*'s rows."""
    if selected_user == 'All':
        return df
    return df[df['user'] == selected_user]


def _load_stop_words(path=_STOP_WORDS_FILE):
    """Read the stop-word file and return its whitespace-separated words as a set."""
    with open(path, 'r') as handle:
        return set(handle.read().split())


def _has_emoji(text):
    """Return True if any character of *text* is a key of ``emoji.EMOJI_DATA``."""
    return any(char in emoji.EMOJI_DATA for char in text)


def _content_words(df):
    """Return the lower-cased, non-stop-word tokens of the real messages in *df*.

    Group notifications, media placeholders and "deleted message" markers are
    skipped.
    """
    stop_words = _load_stop_words()
    chat = df[df['user'] != 'group_notification']
    chat = chat[~chat['message'].isin(_NON_TEXT_MESSAGES)]
    return [
        word
        for message in chat['message']
        for word in message.lower().split()
        # Whole-word membership: testing against the raw file text would also
        # drop every word that merely occurs as a substring of the file.
        if word not in stop_words
    ]


def preprocess_data(chat_data: str) -> pd.DataFrame:
    """Parse raw chat export text into one row per message.

    Args:
        chat_data: Full text of the export. Each entry must start with a
            timestamp such as ``1/2/23, 9:05 PM - ``.

    Returns:
        DataFrame with columns ``date_time``, ``message``, ``user``, ``date``,
        ``time``, ``hour``, ``minute``, ``day``, ``month``, ``year``,
        ``weekday`` (0 = Monday), ``weekday_en`` and ``month_en``. Entries
        without a ``sender: `` prefix get the user ``'group_notification'``.

    Raises:
        ValueError: From ``pd.to_datetime`` when a timestamp does not match
            ``%m/%d/%y, %I:%M %p - `` (for example a four-digit year).
    """
    bodies = re.split(_TIMESTAMP_PATTERN, chat_data)[1:]
    stamps = re.findall(_TIMESTAMP_PATTERN, chat_data)
    df = pd.DataFrame({'date_time': stamps, 'message': bodies})

    # The pattern's \s can match a narrow no-break space before AM/PM, which
    # the strptime format (plain spaces) cannot.
    df['date_time'] = df['date_time'].str.replace('\u202f', ' ', regex=False)
    df['date_time'] = pd.to_datetime(df['date_time'], format=_TIMESTAMP_FORMAT)

    # Separate users and their corresponding messages.
    users = []
    messages = []
    for body in df['message']:
        # maxsplit=1: only the first ": " separates sender from text; without
        # it, any later ": " inside the text split again and the stored
        # message became the (empty) piece between the first two matches.
        parts = _SENDER_PATTERN.split(body, maxsplit=1)
        if len(parts) > 1:
            users.append(parts[1])
            messages.append(parts[2])
        else:
            users.append('group_notification')
            messages.append(parts[0])
    df['user'] = users
    df['message'] = messages

    # Calendar breakdown of the timestamp.
    stamp = df['date_time'].dt
    df['date'] = stamp.date
    df['time'] = stamp.time
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    df['day'] = stamp.day
    df['month'] = stamp.month
    df['year'] = stamp.year
    df['weekday'] = stamp.weekday
    df['weekday_en'] = df['weekday'].map(dict(enumerate(_WEEKDAY_NAMES)))
    df['month_en'] = df['month'].map(dict(enumerate(_MONTH_NAMES, start=1)))
    return df


def fetch_stats(selected_user: str, df: pd.DataFrame):
    """Compute headline counts for one user or for the whole chat.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with at least ``user`` and ``message`` columns.

    Returns:
        Tuple ``(user_df, total_messages, media_messages, links, emojis,
        total_words)`` where ``user_df`` is the filtered frame,
        ``media_messages`` counts media placeholder messages, ``links`` counts
        messages containing ``'http'``, ``emojis`` counts messages containing
        at least one emoji character and ``total_words`` is the number of
        whitespace-separated tokens over all messages.
    """
    user_df = _filter_user(df, selected_user)
    texts = user_df['message']

    total_messages = user_df.shape[0]
    media_messages = int(texts.isin(_MEDIA_PLACEHOLDERS).sum())
    links = int(texts.str.contains('http', regex=False).sum())
    # Same emoji definition as most_common_emojis, instead of a single narrow
    # code-point range.
    emojis = sum(1 for text in texts if _has_emoji(text))
    total_words = sum(len(text.split()) for text in texts)
    return user_df, total_messages, media_messages, links, emojis, total_words


def busiest_users(df: pd.DataFrame):
    """Rank users by message count and draw the ranking as a bar chart.

    Args:
        df: DataFrame with a ``user`` column.

    Returns:
        Tuple ``(ranking, plt)``: ``ranking`` has columns ``user``,
        ``message_count`` and ``percentage`` (share of all messages, rounded
        to 2 decimals), sorted by ``message_count`` descending; ``plt`` is the
        pyplot module whose current figure holds the chart.
    """
    ranking = df['user'].value_counts().reset_index()
    ranking.columns = ['user', 'message_count']
    share = ranking['message_count'] / ranking['message_count'].sum() * 100
    ranking['percentage'] = share.round(2)
    ranking = ranking.sort_values(by='message_count', ascending=False)

    # Start a fresh figure so the bars are not drawn on top of whatever plot
    # happened to be current (word_cloud and activity_heatmap do the same).
    plt.figure()
    plt.bar(ranking.user, ranking.message_count)
    plt.xticks(rotation=45)
    plt.title('Busiest users')
    plt.xlabel('Users')
    plt.ylabel('Total messages')
    return ranking, plt


def word_cloud(df: pd.DataFrame, selected_user: str):
    """Draw a word cloud of the words used by one user or by everyone.

    Note the argument order: ``df`` comes first here, unlike the other
    helpers in this module.

    Args:
        df: DataFrame with ``user`` and ``message`` columns.
        selected_user: A value of the ``user`` column, or ``'All'``.

    Returns:
        The pyplot module; its current figure holds the rendered cloud.

    Raises:
        OSError: If ``stop_hinglish.txt`` cannot be opened.
        ValueError: From ``WordCloud.generate`` when no words remain.
    """
    words = _content_words(_filter_user(df, selected_user))
    cloud = WordCloud(
        width=800,
        height=400,
        random_state=21,
        max_font_size=110,
        background_color='white',
    ).generate(' '.join(words))

    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    return plt


def most_common_words(selected_user: str, df: pd.DataFrame):
    """Find the 50 most frequent words and plot them as horizontal bars.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        Tuple ``(most_common_df, fig)``; the frame has columns ``word`` and
        ``word_count`` and is empty when no words remain.

    Raises:
        OSError: If ``stop_hinglish.txt`` cannot be opened.
    """
    words = _content_words(_filter_user(df, selected_user))
    # Passing the column names at construction keeps an empty result valid;
    # assigning .columns afterwards raises on a frame with no columns.
    most_common_df = pd.DataFrame(
        Counter(words).most_common(50), columns=['word', 'word_count']
    )

    fig, ax = plt.subplots(figsize=(10, 15))
    ax.barh(most_common_df.word, most_common_df.word_count, height=0.8)
    ax.set_xlabel('Word count')
    ax.set_ylabel('Words')
    ax.set_title('Most common words')
    return most_common_df, fig


def most_common_emojis(selected_user: str, df: pd.DataFrame) -> pd.DataFrame:
    """Count every emoji character used, most frequent first.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user`` and ``message`` columns.

    Returns:
        DataFrame with columns ``emoji`` and ``emoji_count``; empty when no
        emoji was used.
    """
    df = _filter_user(df, selected_user)
    counts = Counter(
        char
        for message in df['message']
        for char in message
        if char in emoji.EMOJI_DATA
    )
    return pd.DataFrame(counts.most_common(), columns=['emoji', 'emoji_count'])


def monthly_timeline(selected_user: str, df: pd.DataFrame):
    """Count messages per calendar month and plot them as a line.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user``, ``message``, ``year``, ``month`` and
            ``month_en`` columns.

    Returns:
        Tuple ``(timeline, fig)``; the frame has columns ``year``,
        ``month_en``, ``message`` (the count) and ``month_year``, in
        chronological order.
    """
    df = _filter_user(df, selected_user)
    # The numeric month is part of the key only so that groupby sorts the
    # months chronologically; grouping on the name alone sorts them
    # alphabetically (Apr, Aug, Dec, ...), which scrambles the time axis.
    timeline = (
        df.groupby(['year', 'month', 'month_en'])['message']
        .count()
        .reset_index()
        .drop(columns='month')
    )
    timeline['month_year'] = timeline['month_en'] + ' ' + timeline['year'].astype(str)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(timeline['month_year'], timeline['message'])
    ax.set_title('Messages sent over Months')
    ax.set_xlabel('Month-Year')
    ax.set_ylabel('Total messages')
    return timeline, fig


def daily_timeline(selected_user: str, df: pd.DataFrame):
    """Count messages per day and plot them as a line.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user``, ``message`` and ``date`` columns.

    Returns:
        Tuple ``(timeline, fig)``; the frame has columns ``date`` and
        ``message_count``.
    """
    df = _filter_user(df, selected_user)
    timeline = df.groupby('date')['message'].count().reset_index()
    timeline.columns = ['date', 'message_count']

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(timeline['date'], timeline['message_count'])
    ax.set_title('Messages sent over time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Total messages')
    return timeline, fig


def weekday_activity_map(selected_user: str, df: pd.DataFrame):
    """Plot the number of messages per weekday as a bar chart.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user`` and ``weekday_en`` columns.

    Returns:
        Tuple ``(None, fig)``.
    """
    df = _filter_user(df, selected_user)
    week_df = df['weekday_en'].value_counts().reset_index()
    week_df.columns = ['weekday', 'message_count']

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(week_df['weekday'], week_df['message_count'])
    ax.set_title('Messages sent per weekday')
    ax.set_xlabel('Weekday')
    ax.set_ylabel('Total messages')
    # NOTE(review): the sibling helpers return their DataFrame in the first
    # slot; this one returns None. Left as-is so existing callers see the same
    # value -- confirm whether week_df should be returned instead.
    return None, fig


def month_activity_map(selected_user: str, df: pd.DataFrame):
    """Plot the number of messages per month name as a bar chart.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user`` and ``month_en`` columns.

    Returns:
        Tuple ``(month_df, fig)``; the frame has columns ``month`` and
        ``message_count``.
    """
    df = _filter_user(df, selected_user)
    month_df = df['month_en'].value_counts().reset_index()
    month_df.columns = ['month', 'message_count']

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(month_df['month'], month_df['message_count'])
    ax.set_title('Messages sent per month')
    ax.set_xlabel('Month')
    ax.set_ylabel('Total messages')
    return month_df, fig


def hour_activity_map(selected_user: str, df: pd.DataFrame):
    """Plot the number of messages per hour of day on a 24-hour polar chart.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user`` and ``hour`` columns.

    Returns:
        Tuple ``(hour_counts, fig)``; ``hour_counts`` is a Series indexed by
        hour 0..23, with 0 for hours that have no messages.
    """
    df = _filter_user(df, selected_user)

    hours = np.arange(24)
    # One entry per hour, in order. Without the reindex, hours with no
    # messages are missing and the 24 bar angles no longer line up with the
    # heights, which makes ax.bar fail.
    hour_counts = df['hour'].value_counts().reindex(hours, fill_value=0)

    # Convert hours to angles around the full circle.
    radians = 2 * np.pi * (hours / 24)

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': 'polar'})
    ax.bar(radians, hour_counts, width=0.3, bottom=0.2)
    ax.set_theta_direction(-1)  # Clockwise
    ax.set_theta_offset(np.pi / 2.0)  # Start from top
    ax.set_xticks(radians)
    ax.set_xticklabels(hours)
    ax.set_yticklabels([])
    ax.set_title('Busiest Hours of the Day', va='bottom')
    return hour_counts, fig


def activity_heatmap(selected_user: str, df: pd.DataFrame):
    """Draw a weekday-by-hour heatmap of message counts.

    Args:
        selected_user: A value of the ``user`` column, or ``'All'``.
        df: DataFrame with ``user``, ``message``, ``weekday_en`` and ``hour``
            columns.

    Returns:
        The pyplot module; its current figure holds the heatmap.
    """
    df = _filter_user(df, selected_user)
    heatmap_data = df.pivot_table(
        index='weekday_en',
        columns='hour',
        values='message',
        aggfunc='count',
        fill_value=0,
    )
    # pivot_table sorts the weekday names alphabetically; show them Mon..Sun.
    present_days = [day for day in _WEEKDAY_NAMES if day in heatmap_data.index]
    heatmap_data = heatmap_data.reindex(present_days)

    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_data, cmap='gray_r', linewidths=.5, fmt='d')
    plt.title('Activity Heatmap')
    plt.xlabel('Hour of Day')
    plt.ylabel('Day of Week')
    return plt