Bayhaqy committed on
Commit
30dc1a4
·
1 Parent(s): dd13e2f

Delete pages/📱_X_Scrapping.py

Files changed (1)
  1. pages/📱_X_Scrapping.py +0 -547
pages/📱_X_Scrapping.py DELETED
@@ -1,547 +0,0 @@
1
- # Data Analysis and Profiling
2
- import pandas as pd
3
- from ydata_profiling import ProfileReport
4
- from streamlit_pandas_profiling import st_profile_report
5
-
6
- # Streamlit for Building the Dashboard
7
- import streamlit as st
8
- import streamlit_pandas_profiling
9
-
10
- # Language Detection
11
- from langdetect import detect
12
-
13
- # NLP and Text Processing
14
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
15
- from deep_translator import GoogleTranslator
16
- import nltk
17
- from nltk.corpus import stopwords
18
- from nltk.stem import WordNetLemmatizer
19
- from bs4 import BeautifulSoup
20
-
21
- # Sentiment Analysis
22
- from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
23
- from textblob import TextBlob
24
-
25
- # URL Parsing
26
- from urllib.parse import urlparse
27
-
28
- # Data Visualization
29
- import plotly.express as px
30
- import matplotlib.pyplot as plt
31
-
32
- # Word Cloud Generation
33
- from wordcloud import WordCloud
34
-
35
- # Other Libraries
36
- import torch
37
- import requests
38
- import subprocess
39
- import logging
40
- import re
41
- import os
42
-
43
- # NLTK Data Download
44
- nltk.download('wordnet')
45
- nltk.download('punkt')
- nltk.download('stopwords')
46
-
47
- ## ............................................... ##
48
- # Set page configuration (Call this once and make changes as needed)
49
- st.set_page_config(page_title='(Tweet) X Scrapper Dashboard', layout='wide', page_icon=':rocket:')
50
-
51
-
52
- ## ............................................... ##
53
- with st.container():
54
- # Define Streamlit app title and introduction
55
- st.title("(Tweet) X Scrapper Dashboard")
56
- st.write("Created by Bayhaqy")
57
-
58
- # Sidebar content
59
- st.sidebar.subheader("About the app")
60
- st.sidebar.info("This app allows you to collect data and run analysis and predictions with the (Tweet) X Scrapper tool.")
61
-
62
- url = "https://blogs.bayhaqy.my.id/2023/10/auth-token-twitter.html"
63
- st.sidebar.markdown("check this [link](%s) for guides on how to get your own X Auth Token" % url)
64
-
65
- st.sidebar.write("\n\n")
66
- st.sidebar.markdown("**Please contact me if you have any questions**")
67
- st.sidebar.write("\n\n")
68
- st.sidebar.divider()
69
- st.sidebar.markdown("Β© 2023 (Tweet) X Scrapper Dashboard")
70
-
71
- ## ............................................... ##
72
- # Function to install Node.js
73
- @st.cache_data
74
- def install_nodejs():
75
- try:
76
- # Check if Node.js is already installed by attempting to get its version.
77
- node_major_version = int(subprocess.check_output(['node', '-v']).decode("utf-8").split('.')[0][1:])
78
- except FileNotFoundError:
79
- # If 'node' command is not found, it means Node.js is not installed.
80
- node_major_version = 0
81
-
82
- if node_major_version < 20:
83
- st.markdown('Update OS')
84
-
85
- check_os1 = subprocess.check_output(['lsb_release', '-a']).decode("utf-8")
86
- st.markdown(f'OS version: {check_os1}')
87
- check_os2 = subprocess.check_output(['uname', '-r']).decode("utf-8")
88
- st.markdown(f'OS Kernel version: {check_os2}')
89
- check_ver = subprocess.check_output(['python', '--version']).decode("utf-8")
90
- st.markdown(f'Python version: {check_ver}')
91
-
92
- subprocess.check_call(['sudo', 'apt-get', 'update'])
93
-
94
- st.markdown('Download prerequisites for the NodeSource repository')
95
- subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'ca-certificates', 'curl', 'gnupg'])
96
- subprocess.check_call(['sudo', 'mkdir', '-p', '/etc/apt/keyrings'])
97
- subprocess.check_call(f'curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg', shell=True)
98
-
99
- NODE_MAJOR = 20
100
- node_source_entry = f"deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_{NODE_MAJOR}.x nodistro main"
101
- subprocess.check_call(f'echo "{node_source_entry}" | sudo tee /etc/apt/sources.list.d/nodesource.list', shell=True)
102
-
103
- st.markdown('Install Node.js')
104
- subprocess.check_call(['sudo', 'apt-get', 'update'])
105
- subprocess.check_call(['sudo', 'apt-get', 'install', 'nodejs', '-y'])
106
-
107
- result = subprocess.check_output(['node', '-v']).decode("utf-8")
108
- st.markdown(f'Node.js version: {result}')
109
- else:
110
- st.markdown('Node.js is already installed')
111
- result = subprocess.check_output(['node', '-v']).decode("utf-8")
112
- st.markdown(f'Node.js version is already up to date: {result}')
113
-
114
- ## ............................................... ##
115
- # Function to run tweet-harvest
116
- @st.cache_data
117
- def run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename):
118
- # Run tweet-harvest with the provided parameters
119
- #st.markdown('Check Tweet')
120
- command = f'npx --yes tweet-harvest@latest -s "{search_keyword}" -f "{from_date}" -t "{to_date}" -l {limit} -d {delay} --token "{token}" -o "{filename}"'
121
- try:
122
- result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
123
- st.markdown("Command executed successfully.")
124
- st.markdown(result.stdout) # Display the standard output; comment this line out if you don't want to see it
125
- except subprocess.CalledProcessError as e:
126
- st.markdown("Error: The command returned a non-zero exit status.")
127
- st.markdown("Error message:", e)
128
- st.markdown(f'Standard output: {e.stdout}')
129
- st.markdown(f'Standard error: {e.stderr}')
130
-
131
- ## ............................................... ##
132
- # Function for get model and tokenize
133
- @st.cache_resource
134
- def get_models_and_tokenizers():
135
- model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
136
- tokenizer = AutoTokenizer.from_pretrained(model_name)
137
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
138
- #model.eval()
139
-
140
- return model, tokenizer
141
-
142
- ## ............................................... ##
143
- # Function for sentiment analysis
144
- @st.cache_resource
145
- def analyze_sentiment_distilbert(text, _model, _tokenizer):
146
- try:
147
- tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
148
- with torch.no_grad():
149
- raw_predictions = _model(**tokens_info).logits
150
-
151
- predicted_class_id = raw_predictions.argmax().item()
152
- predict = _model.config.id2label[predicted_class_id]
153
-
154
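- # softmax over the two logits gives the class-1 probability; scale it to 0-100 and bucket it into a coarse trust label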
- softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
155
- if (softmaxed > 70):
156
- status = 'Not trust'
157
- elif (softmaxed > 40):
158
- status = 'Not sure'
159
- else:
160
- status = 'Trust'
161
- return status, predict
162
-
163
- except Exception as e:
164
- logging.error(f"Sentiment analysis error: {str(e)}")
165
- return 'N/A', 'N/A'
166
-
167
- ## ............................................... ##
168
- # Function for sentiment analysis using VADER
169
- @st.cache_resource
170
- def analyze_sentiment_vader(text):
171
- analyzer = SentimentIntensityAnalyzer()
172
- sentiment = analyzer.polarity_scores(text)
173
- compound_score = sentiment['compound']
174
- if compound_score >= 0.05:
175
- return 'Positive'
176
- elif compound_score <= -0.05:
177
- return 'Negative'
178
- else:
179
- return 'Neutral'
180
-
181
- ## ............................................... ##
182
- # Function for sentiment analysis using TextBlob
183
- @st.cache_resource
184
- def analyze_sentiment_textblob(text):
185
- analysis = TextBlob(text)
186
- polarity = analysis.sentiment.polarity
187
- if polarity > 0:
188
- return 'Positive'
189
- elif polarity < 0:
190
- return 'Negative'
191
- else:
192
- return 'Neutral'
193
-
194
- ## ............................................... ##
195
- # Function for translation
196
- @st.cache_data
197
- def translate_text(text, source='auto', target='en'):
198
- try:
199
- if source != target:
200
- text = GoogleTranslator(source=source, target=target).translate(text)
201
- return text
202
-
203
- except Exception as e:
204
- logging.error(f"Translation error: {str(e)}")
205
- return text
206
-
207
- ## ............................................... ##
208
- # Function for Load and Transform Data
209
- @st.cache_data
210
- def selection_data(filename):
211
- file_path = f"tweets-data/{filename}"
212
- df = pd.read_csv(file_path, delimiter=";")
213
-
214
-
215
- # Rename columns
216
- column_mapping = {
217
- 'created_at': 'Created Date',
218
- 'user_id_str': 'User ID',
219
- 'username': 'Username',
220
- 'full_text': 'Tweet',
221
- 'tweet_url': 'Tweet URL',
222
- 'id_str': 'Tweet ID',
223
- 'conversation_id_str': 'Conversation ID',
224
- 'lang': 'App Language',
225
- 'quote_count': 'Quote Count',
226
- 'reply_count': 'Reply Count',
227
- 'retweet_count': 'Retweet Count',
228
- 'favorite_count': 'Favorite Count',
229
- }
230
-
231
- df = df.rename(columns=column_mapping)
232
-
233
- # Add a new column for detected language
234
- df['Detect Language'] = df['Tweet'].apply(lambda tweet: detect(tweet))
235
-
236
- # Mapping language codes to country names
237
- language_to_country = {
238
- 'af': 'South Africa',
239
- 'ar': 'Arabic',
240
- 'bg': 'Bulgaria',
241
- 'bn': 'Bangladesh',
242
- 'ca': 'Catalan',
243
- 'cs': 'Czech',
244
- 'cy': 'Welsh',
245
- 'da': 'Danish',
246
- 'de': 'German',
247
- 'el': 'Greek',
248
- 'en': 'English',
249
- 'es': 'Spanish',
250
- 'et': 'Estonian',
251
- 'fa': 'Persian',
252
- 'fi': 'Finnish',
253
- 'fr': 'French',
254
- 'gu': 'Gujarati',
255
- 'he': 'Hebrew',
256
- 'hi': 'Hindi',
257
- 'hr': 'Croatian',
258
- 'hu': 'Hungarian',
259
- 'id': 'Indonesian',
260
- 'it': 'Italian',
261
- 'ja': 'Japanese',
262
- 'kn': 'Kannada',
263
- 'ko': 'Korean',
264
- 'lt': 'Lithuanian',
265
- 'lv': 'Latvian',
266
- 'mk': 'Macedonian',
267
- 'ml': 'Malayalam',
268
- 'mr': 'Marathi',
269
- 'ne': 'Nepali',
270
- 'nl': 'Dutch',
271
- 'no': 'Norwegian',
272
- 'pa': 'Punjabi',
273
- 'pl': 'Polish',
274
- 'pt': 'Portuguese',
275
- 'ro': 'Romanian',
276
- 'ru': 'Russian',
277
- 'sk': 'Slovak',
278
- 'sl': 'Slovenian',
279
- 'so': 'Somali',
280
- 'sq': 'Albanian',
281
- 'sv': 'Swedish',
282
- 'sw': 'Swahili',
283
- 'ta': 'Tamil',
284
- 'te': 'Telugu',
285
- 'th': 'Thai',
286
- 'tl': 'Tagalog',
287
- 'tr': 'Turkish',
288
- 'uk': 'Ukrainian',
289
- 'ur': 'Urdu',
290
- 'vi': 'Vietnamese',
291
- 'zh-cn': 'Simplified Chinese',
292
- 'zh-tw': 'Traditional Chinese'
293
- }
294
-
295
- # Add 'Language' column to df
296
- df['Language'] = df['Detect Language'].map(language_to_country)
297
-
298
- # Sort columns
299
- desired_columns = ['Created Date', 'User ID', 'Username', 'Tweet', 'Language', 'Detect Language', 'App Language', 'Tweet URL', 'Tweet ID', 'Conversation ID', 'Quote Count', 'Reply Count', 'Retweet Count', 'Favorite Count']
300
- df = df[desired_columns]
301
-
302
- # Set data types
303
- data_types = {
304
- 'Created Date': 'datetime64[ns]',
305
- 'User ID': 'int64',
306
- 'Username': 'object',
307
- 'Tweet': 'object',
308
- 'Language': 'object',
309
- 'Detect Language': 'object',
310
- 'App Language': 'object',
311
- 'Tweet URL': 'object',
312
- 'Tweet ID': 'int64',
313
- 'Conversation ID': 'int64',
314
- 'Quote Count': 'int64',
315
- 'Reply Count': 'int64',
316
- 'Retweet Count': 'int64',
317
- 'Favorite Count': 'int64',
318
- }
319
-
320
- df = df.astype(data_types)
321
-
322
- return df
323
-
324
- ## ............................................... ##
325
- # Function to preprocess the data
326
- @st.cache_data
327
- def preprocessing_data(df):
328
- # Remove duplicates
329
- df = df.drop_duplicates(subset='Translation')
330
-
331
- # Function to clean and preprocess text
332
- def clean_text(text):
333
- # Remove mentions (e.g., @username)
334
- text = re.sub(r'@[\w]+', '', text)
335
-
336
- # Remove URLs
337
- text = re.sub(r'http\S+', '', text)
338
-
339
- # Remove HTML tags
340
- text = BeautifulSoup(text, 'html.parser').get_text()
341
-
342
- # Convert to lowercase
343
- text = text.lower()
344
-
345
- # Remove non-alphanumeric characters
346
- text = re.sub(r'[^a-zA-Z\s]', '', text)
347
-
348
- # Tokenize text
349
- words = nltk.word_tokenize(text)
350
-
351
- # Remove stopwords
352
- stop_words = set(stopwords.words('english'))
353
- words = [word for word in words if word not in stop_words]
354
-
355
- # Lemmatize words
356
- lemmatizer = WordNetLemmatizer()
357
- words = [lemmatizer.lemmatize(word) for word in words]
358
-
359
- return ' '.join(words)
360
-
361
- # Apply the clean_text function to the "Translation" column
362
- df['Cleaned Translation'] = df['Translation'].apply(clean_text)
363
-
364
- return df
365
-
366
- ## ............................................... ##
367
- # Function to create a Word Cloud
368
- @st.cache_data
369
- def create_wordcloud(df):
370
- # Combine all text
371
- text = ' '.join(df['Cleaned Translation'])
372
-
373
- # Create a Word Cloud
374
- wordcloud = WordCloud(width=700, height=400, max_words=50).generate(text)
375
-
376
- # Convert the word cloud to an image
377
- wordcloud_image = wordcloud.to_image()
378
-
379
- # Display the Word Cloud using st.image
380
- st.write("word Cloud by Tweets")
381
- st.image(wordcloud_image, use_column_width=True)
382
-
383
- ## ............................................... ##
384
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
385
- @st.cache_data
386
- def convert_df(df):
387
- return df.to_csv().encode('utf-8')
388
-
389
- ## ............................................... ##
390
- # Set up logging
391
- logging.basicConfig(filename='tweet_harvest.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
392
-
393
- ## ............................................... ##
394
- with st.container():
395
- # Input search parameters
396
- search_keyword = st.text_input("Enter search keyword", "Jakarta",)
397
-
398
- col1, col2 = st.columns(2)
399
-
400
- with col1:
401
- from_date = st.date_input('From Date :', pd.to_datetime('2023-01-01'))
402
- to_date = st.date_input('To Date :', pd.to_datetime('2023-12-01'))
403
- with col2:
404
- limit = st.number_input("Enter limit", min_value=10, value=10, max_value=100)
405
- delay = st.number_input("Enter delay in seconds", min_value=1, value=3)
406
-
407
- token = st.text_input("Enter your X Auth Token", type="password")
408
-
409
- ## ............................................... ##
410
- with st.container():
411
- col1, col2 = st.columns(2)
412
-
413
- with col1:
414
- # Checkbox options for different processing steps
415
- include_translation = st.checkbox("Include Translation", value=False)
416
- include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
417
- with col2:
418
- include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
419
- include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
420
-
421
- ## ............................................... ##
422
- # Install Node.js if it is missing or outdated
423
- install_nodejs()
424
-
425
- # Initialize model and tokenizer
426
- model, tokenizer = get_models_and_tokenizers()
427
-
428
- # Create a variable to track whether the data has been processed
429
- data_processed = False
430
-
431
- ## ............................................... ##
432
- # Create a button to trigger tweet-harvest
433
- with st.container():
434
- if st.button("Run it"):
435
- # Format the dates as "DD-MM-YYYY"
436
- from_date = from_date.strftime("%d-%m-%Y")
437
- to_date = to_date.strftime("%d-%m-%Y")
438
-
439
- filename = 'tweets_data.csv'
440
-
441
- run_X_scrapping(search_keyword, from_date, to_date, limit, delay, token, filename)
442
-
443
- df = selection_data(filename)
444
-
445
- # Conditionally translate the 'Tweet' column into a new 'Translation' column
446
- if include_translation:
447
- df['Translation'] = df.apply(lambda row: translate_text((row['Tweet']), source=row['Detect Language'], target='en'), axis=1)
448
- df = preprocessing_data(df)
449
-
450
- # Conditionally apply sentiment analysis to the 'Translation' column (requires 'Include Translation' to be checked)
451
- if include_sentiment_analysis:
452
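- # analyze_sentiment_distilbert returns a (trust status, sentiment label) tuple; pd.Series expands it into two columns, which are then title-cased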
- df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer))).apply(lambda x: x.str.title())
453
-
454
- # Conditionally apply VADER sentiment analysis to the 'Translation' column
455
- if include_sentiment_vader:
456
- df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)
457
-
458
- # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
459
- if include_sentiment_textblob:
460
- df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
461
-
462
- # Set data_processed to True when the data has been successfully processed
463
- data_processed = True
464
-
465
- ## ............................................... ##
466
- # Add a button to download the data as a CSV file
467
- if data_processed:
468
- st.markdown("### Download Processed Data as CSV")
469
- st.write("Click the button below to download the processed data as a CSV file.")
470
- csv_data = convert_df(df)
471
-
472
- # Create a downloadable link
473
- st.download_button(
474
- label="Download data as CSV",
475
- data=csv_data,
476
- file_name='processed_data.csv',
477
- mime='text/csv',
478
- )
479
-
480
- with st.expander("See Table"):
481
- ## ............................................... ##
482
- # Display processed data
483
- st.dataframe(df)
484
-
485
- # Display processed data
486
- with st.expander("See EDA"):
487
- ## ............................................... ##
488
- # Create a Streamlit app
489
- st.subheader("Tweet Data Visualization")
490
-
491
- col1, col2 = st.columns(2)
492
- with col1:
493
- ## ............................................... ##
494
- # Create a new column with a count of 1 for each tweet
495
- df_date = pd.DataFrame(df['Created Date'])
496
- df_date['Tweet Count'] = 1
497
-
498
- # Resample the data per second and calculate the count
499
- data_resampled = df_date.resample('S', on='Created Date')['Tweet Count'].count().reset_index()
500
-
501
- # Create a time series plot with custom styling
502
- fig = px.line(data_resampled, x='Created Date', y='Tweet Count', title='Tweet Counts Over Time')
503
- fig.update_xaxes(title_text='Time')
504
- fig.update_yaxes(title_text='Tweet Count')
505
- fig.update_layout(xaxis_rangeslider_visible=True)
506
-
507
- # Render the chart at the container width
508
- st.plotly_chart(fig, use_container_width=True)
509
-
510
- ## ............................................... ##
511
- # Group by Sentiment columns and get the count
512
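- # Note: this assumes all three sentiment options were enabled; selecting a missing column raises a KeyError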
- sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
513
-
514
- # Reset index to get Sentiment as a column
515
- sentiment_counts = sentiment_counts.reset_index()
516
-
517
- # Melt the DataFrame for easier plotting
518
- sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
519
-
520
- # Create the plot
521
- fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total Tweet per Sentiment')
522
-
523
- # Render the chart at the container width
524
- st.plotly_chart(fig, use_container_width=True)
525
-
526
- with col2:
527
- ## ............................................... ##
528
- # Create a DataFrame to count the number of tweets by language
529
- language_counts = df['Language'].value_counts().reset_index()
530
- language_counts.columns = ['Language', 'Tweet Count']
531
-
532
- # Create an attractive Plotly bar chart
533
- fig = px.bar(language_counts, x='Language', y='Tweet Count', text='Tweet Count', title='Total Tweet by Language')
534
- fig.update_xaxes(title_text='Language')
535
- fig.update_yaxes(title_text='Total Tweet')
536
-
537
- # Render the chart at the container width
538
- st.plotly_chart(fig, use_container_width=True)
539
-
540
- ## ............................................... ##
541
- # Create wordcloud
542
- create_wordcloud(df)
543
-
544
- ## ............................................... ##
545
- # Show dataset information
546
- pr = ProfileReport(df)
547
- st_profile_report(pr)