Delete pages/📱_X_Scrapping.py

pages/📱_X_Scrapping.py  +0 -547
DELETED
@@ -1,547 +0,0 @@
# Data Analysis and Profiling
import pandas as pd
from ydata_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report

# Streamlit for Building the Dashboard
import streamlit as st
import streamlit_pandas_profiling

# Language Detection
from langdetect import detect

# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# URL Parsing
from urllib.parse import urlparse

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Word Cloud Generation
from wordcloud import WordCloud

# Other Libraries
import torch
import requests
import subprocess
import logging
import re
import os

# NLTK Data Download
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')  # needed by the stopword-removal step in preprocessing_data
## ............................................... ##
# Set page configuration (Call this once and make changes as needed)
st.set_page_config(page_title='(Tweet) X Scrapper Dashboard', layout='wide', page_icon=':rocket:')


## ............................................... ##
with st.container():
    # Define Streamlit app title and introduction
    st.title("(Tweet) X Scrapper Dashboard")
    st.write("Created by Bayhaqy")

    # Sidebar content
    st.sidebar.subheader("About the app")
    st.sidebar.info("This app allows you to get data, analysis and prediction with the (Tweet) X Scrapper tool.")

    url = "https://blogs.bayhaqy.my.id/2023/10/auth-token-twitter.html"
    st.sidebar.markdown("Check this [link](%s) for guides on how to get your own X Auth Token" % url)

    st.sidebar.write("\n\n")
    st.sidebar.markdown("**Please contact me if you have any questions**")
    st.sidebar.write("\n\n")
    st.sidebar.divider()
    st.sidebar.markdown("© 2023 (Tweet) X Scrapper Dashboard")

## ............................................... ##
# Function to install Node.js
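# tweet-harvest is an npm package invoked through npx further down, so a recent
# Node.js runtime must be available; the check below treats anything older than
# major version 20 as needing a (re)install from the Nodesource repository.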
@st.cache_data
def install_nodejs():
    try:
        # Check if Node.js is already installed by attempting to get its version.
        node_major_version = int(subprocess.check_output(['node', '-v']).decode("utf-8").split('.')[0][1:])
    except FileNotFoundError:
        # If 'node' command is not found, it means Node.js is not installed.
        node_major_version = 0

    if node_major_version < 20:
        st.markdown('Update OS')

        check_os1 = subprocess.check_output(['lsb_release', '-a']).decode("utf-8")
        st.markdown(f'OS version: {check_os1}')
        check_os2 = subprocess.check_output(['uname', '-r']).decode("utf-8")
        st.markdown(f'OS Kernel version: {check_os2}')
        check_ver = subprocess.check_output(['python', '--version']).decode("utf-8")
        st.markdown(f'Python version: {check_ver}')

        subprocess.check_call(['sudo', 'apt-get', 'update'])

        st.markdown('Download Files Requirement for Nodesource')
        subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'ca-certificates', 'curl', 'gnupg'])
        subprocess.check_call(['sudo', 'mkdir', '-p', '/etc/apt/keyrings'])
        subprocess.check_call('curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg', shell=True)

        NODE_MAJOR = 20
        node_source_entry = f"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_{NODE_MAJOR}.x nodistro main"
        subprocess.check_call(f'echo "{node_source_entry}" | sudo tee /etc/apt/sources.list.d/nodesource.list', shell=True)

        st.markdown('Install Node.js')
        subprocess.check_call(['sudo', 'apt-get', 'update'])
        subprocess.check_call(['sudo', 'apt-get', 'install', 'nodejs', '-y'])

        result = subprocess.check_output(['node', '-v']).decode("utf-8")
        st.markdown(f'Node.js version: {result}')
    else:
        st.markdown('Node.js is already installed')
        result = subprocess.check_output(['node', '-v']).decode("utf-8")
        st.markdown(f'Node.js version already updated to {result}')

## ............................................... ##
# Function to run tweet-harvest
@st.cache_data
def run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename):
    # Run tweet-harvest with the provided parameters
    #st.markdown('Check Tweet')
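    # Flags passed to the tweet-harvest CLI as used here: -s search keyword,
    # -f/-t date range (DD-MM-YYYY), -l maximum number of tweets, -d delay in seconds,
    # --token the X auth token, -o name of the output CSV.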
    command = f'npx --yes tweet-harvest@latest -s "{search_keyword}" -f "{from_date}" -t "{to_date}" -l {limit} -d {delay} --token "{token}" -o "{filename}"'
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
        st.markdown("Command executed successfully.")
        st.markdown(result.stdout)  # Display the standard output; comment this out if you don't want to see it
    except subprocess.CalledProcessError as e:
        st.markdown("Error: The command returned a non-zero exit status.")
        st.markdown(f"Error message: {e}")
        st.markdown(f'Standard output: {e.stdout}')
        st.markdown(f'Standard error: {e.stderr}')

## ............................................... ##
# Function to load the model and tokenizer
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    #model.eval()

    return model, tokenizer

## ............................................... ##
# Function for sentiment analysis
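# The leading underscore on _model and _tokenizer tells Streamlit's cache not to hash
# those arguments (they are not hashable); only `text` takes part in the cache key.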
@st.cache_resource
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = _model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

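        # softmaxed is the probability (in percent) of class index 1, i.e. POSITIVE for this
        # SST-2 checkpoint; the thresholds below bucket it into the labels used by 'Fake Check'.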
        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if (softmaxed > 70):
            status = 'Not trust'
        elif (softmaxed > 40):
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict

    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

## ............................................... ##
# Function for sentiment analysis using VADER
@st.cache_resource
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    compound_score = sentiment['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

## ............................................... ##
# Function for sentiment analysis using TextBlob
@st.cache_resource
def analyze_sentiment_textblob(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

## ............................................... ##
# Function for translation
@st.cache_data
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text

    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

## ............................................... ##
# Function to load and transform the data
@st.cache_data
def selection_data(filename):
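    # The harvested CSV ends up under tweets-data/ with ';' as the field separator,
    # hence the hard-coded path prefix and delimiter below.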
    file_path = f"tweets-data/{filename}"
    df = pd.read_csv(file_path, delimiter=";")

    # Rename columns
    column_mapping = {
        'created_at': 'Created Date',
        'user_id_str': 'User ID',
        'username': 'Username',
        'full_text': 'Tweet',
        'tweet_url': 'Tweet URL',
        'id_str': 'Tweet ID',
        'conversation_id_str': 'Conversation ID',
        'lang': 'App Language',
        'quote_count': 'Quote Count',
        'reply_count': 'Reply Count',
        'retweet_count': 'Retweet Count',
        'favorite_count': 'Favorite Count',
    }

    df = df.rename(columns=column_mapping)

    # Add a new column for detected language
    df['Detect Language'] = df['Tweet'].apply(lambda tweet: detect(tweet))

    # Mapping detected language codes to display names (used for the 'Language' column)
    language_to_country = {
        'af': 'South Africa',
        'ar': 'Arabic',
        'bg': 'Bulgaria',
        'bn': 'Bangladesh',
        'ca': 'Catalan',
        'cs': 'Czech',
        'cy': 'Welsh',
        'da': 'Danish',
        'de': 'German',
        'el': 'Greek',
        'en': 'English',
        'es': 'Spanish',
        'et': 'Estonian',
        'fa': 'Persian',
        'fi': 'Finnish',
        'fr': 'French',
        'gu': 'Gujarati',
        'he': 'Hebrew',
        'hi': 'Hindi',
        'hr': 'Croatian',
        'hu': 'Hungarian',
        'id': 'Indonesian',
        'it': 'Italian',
        'ja': 'Japanese',
        'kn': 'Kannada',
        'ko': 'Korean',
        'lt': 'Lithuanian',
        'lv': 'Latvian',
        'mk': 'Macedonian',
        'ml': 'Malayalam',
        'mr': 'Marathi',
        'ne': 'Nepali',
        'nl': 'Dutch',
        'no': 'Norwegian',
        'pa': 'Punjabi',
        'pl': 'Polish',
        'pt': 'Portuguese',
        'ro': 'Romanian',
        'ru': 'Russian',
        'sk': 'Slovak',
        'sl': 'Slovenian',
        'so': 'Somali',
        'sq': 'Albanian',
        'sv': 'Swedish',
        'sw': 'Swahili',
        'ta': 'Tamil',
        'te': 'Telugu',
        'th': 'Thai',
        'tl': 'Tagalog',
        'tr': 'Turkish',
        'uk': 'Ukrainian',
        'ur': 'Urdu',
        'vi': 'Vietnamese',
        'zh-cn': 'Simplified Chinese',
        'zh-tw': 'Traditional Chinese'
    }

    # Add 'Language' column to df
    df['Language'] = df['Detect Language'].map(language_to_country)

    # Sort columns
    desired_columns = ['Created Date', 'User ID', 'Username', 'Tweet', 'Language', 'Detect Language', 'App Language', 'Tweet URL', 'Tweet ID', 'Conversation ID', 'Quote Count', 'Reply Count', 'Retweet Count', 'Favorite Count']
    df = df[desired_columns]

    # Set data types
    data_types = {
        'Created Date': 'datetime64[ns]',
        'User ID': 'int64',
        'Username': 'object',
        'Tweet': 'object',
        'Language': 'object',
        'Detect Language': 'object',
        'App Language': 'object',
        'Tweet URL': 'object',
        'Tweet ID': 'int64',
        'Conversation ID': 'int64',
        'Quote Count': 'int64',
        'Reply Count': 'int64',
        'Retweet Count': 'int64',
        'Favorite Count': 'int64',
    }

    df = df.astype(data_types)

    return df

## ............................................... ##
# Function to preprocess the data
@st.cache_data
def preprocessing_data(df):
    # Remove duplicates
    df = df.drop_duplicates(subset='Translation')

    # Function to clean and preprocess text
    def clean_text(text):
        # Remove mentions (e.g., @username)
        text = re.sub(r'@[\w]+', '', text)

        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Remove HTML tags
        text = BeautifulSoup(text, 'html.parser').get_text()

        # Convert to lowercase
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize text
        words = nltk.word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)

    # Apply the clean_text function to the "Translation" column
    df['Cleaned Translation'] = df['Translation'].apply(clean_text)

    return df

## ............................................... ##
# Function to create a Word Cloud
@st.cache_data
def create_wordcloud(df):
    # Combine all text
    text = ' '.join(df['Cleaned Translation'])

    # Create a Word Cloud
    wordcloud = WordCloud(width=700, height=400, max_words=50).generate(text)

    # Convert the word cloud to an image
    wordcloud_image = wordcloud.to_image()

    # Display the Word Cloud using st.image
    st.write("Word Cloud by Tweets")
    st.image(wordcloud_image, use_column_width=True)

## ............................................... ##
# IMPORTANT: Cache the conversion to prevent computation on every rerun
@st.cache_data
def convert_df(df):
    return df.to_csv().encode('utf-8')

## ............................................... ##
# Set up logging
logging.basicConfig(filename='tweet_harvest.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## ............................................... ##
with st.container():
    # Input search parameters
    search_keyword = st.text_input("Enter search keyword", "Jakarta")

    col1, col2 = st.columns(2)

    with col1:
        from_date = st.date_input('From Date :', pd.to_datetime('2023-01-01'))
        to_date = st.date_input('To Date :', pd.to_datetime('2023-12-01'))
    with col2:
        limit = st.number_input("Enter limit", min_value=10, value=10, max_value=100)
        delay = st.number_input("Enter delay in seconds", min_value=1, value=3)

    token = st.text_input("Enter your X Auth Token", type="password")

## ............................................... ##
with st.container():
    col1, col2 = st.columns(2)

    with col1:
        # Checkbox options for different processing steps
        include_translation = st.checkbox("Include Translation", value=False)
        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
    with col2:
        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)

## ............................................... ##
# Initialize to install node.js
install_nodejs()

# Initialize model and tokenizer
model, tokenizer = get_models_and_tokenizers()

# Create a variable to track whether the data has been processed
data_processed = False
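# Streamlit reruns the whole script on every interaction, so data_processed is reset to
# False each time; the download and EDA sections below therefore only render on the rerun
# triggered by the "Run it" button.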
## ............................................... ##
# Create a button to trigger tweet-harvest
with st.container():
    if st.button("Run it"):
        # Format the dates as "DD-MM-YYYY"
        from_date = from_date.strftime("%d-%m-%Y")
        to_date = to_date.strftime("%d-%m-%Y")

        filename = 'tweets_data.csv'

        run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename)

        df = selection_data(filename)

        # Conditionally apply translation function to the 'Translation' column
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text((row['Tweet']), source=row['Detect Language'], target='en'), axis=1)
            df = preprocessing_data(df)

        # Conditionally apply sentiment analysis function to the 'Translation' column
        if include_sentiment_analysis:
            df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer))).apply(lambda x: x.str.title())

        # Conditionally apply VADER sentiment analysis to the 'Translation' column
        if include_sentiment_vader:
            df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)

        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
        if include_sentiment_textblob:
            df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)

        # Set data_processed to True when the data has been successfully processed
        data_processed = True

## ............................................... ##
# Add a button to download the data as a CSV file
if data_processed:
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")
    csv_data = convert_df(df)

    # Create a downloadable link
    st.download_button(
        label="Download data as CSV",
        data=csv_data,
        file_name='processed_data.csv',
        mime='text/csv',
    )

    with st.expander("See Table"):
        ## ............................................... ##
        # Display processed data
        st.dataframe(df)

    # Display processed data
    with st.expander("See EDA"):
        ## ............................................... ##
        # Create a Streamlit app
        st.subheader("Tweet Data Visualization")

        col1, col2 = st.columns(2)
        with col1:
            ## ............................................... ##
            # Create a new column with a count of 1 for each tweet
            df_date = pd.DataFrame(df['Created Date'])
            df_date['Tweet Count'] = 1

            # Resample the data per second and calculate the count
            data_resampled = df_date.resample('S', on='Created Date')['Tweet Count'].count().reset_index()

            # Create a time series plot with custom styling
            fig = px.line(data_resampled, x='Created Date', y='Tweet Count', title='Tweet Counts Over Time')
            fig.update_xaxes(title_text='Time')
            fig.update_yaxes(title_text='Tweet Count')
            fig.update_layout(xaxis_rangeslider_visible=True)

            # Specify custom dimensions for the chart
            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

            ## ............................................... ##
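            # The sentiment columns below only exist when their checkboxes were ticked above;
            # with none of them enabled this selection raises a KeyError.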
            # Group by Sentiment columns and get the count
            sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T

            # Reset index to get Sentiment as a column
            sentiment_counts = sentiment_counts.reset_index()

            # Melt the DataFrame for easier plotting
            sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')

            # Create the plot
            fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total Tweet per Sentiment')

            # Specify custom dimensions for the chart
            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

        with col2:
            ## ............................................... ##
            # Create a DataFrame to count the number of tweets by language
            language_counts = df['Language'].value_counts().reset_index()
            language_counts.columns = ['Language', 'Tweet Count']

            # Create an attractive Plotly bar chart
            fig = px.bar(language_counts, x='Language', y='Tweet Count', text='Tweet Count', title='Total Tweet by Language')
            fig.update_xaxes(title_text='Language')
            fig.update_yaxes(title_text='Total Tweet')

            # Specify custom dimensions for the chart
            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

        ## ............................................... ##
        # Create wordcloud
        create_wordcloud(df)

        ## ............................................... ##
        # Show dataset information
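        # Generating the ydata-profiling report is usually the slowest step in this expander,
        # particularly for larger tweet pulls.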
        pr = ProfileReport(df)
        st_profile_report(pr)