Bayhaqy committed on
Commit 1445b61 · 1 Parent(s): 30dc1a4

Update pages/News_Scrapping.py

Files changed (1)
  1. pages/News_Scrapping.py +249 -107
pages/News_Scrapping.py CHANGED
@@ -1,29 +1,64 @@
- from streamlit_pandas_profiling import st_profile_report
+ %%writefile app.py
+ # News Information and data article
+ from newspaper import Article, Config
+ from gnews import GNews
+
+ # Data Analysis and Profiling
+ import pandas as pd
  from ydata_profiling import ProfileReport
+ from st_aggrid import AgGrid, GridOptionsBuilder
+
+ # Streamlit for Building the Dashboard
  import streamlit as st
- import pandas as pd
- from newspaper import Article, Config
+ from streamlit_pandas_profiling import st_profile_report
+
+ # Language Detection
  from langdetect import detect
+
+ # NLP and Text Processing
  from transformers import AutoModelForSequenceClassification, AutoTokenizer
  from deep_translator import GoogleTranslator
- import torch
- import requests
- import logging
- from gnews import GNews
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from bs4 import BeautifulSoup
+
+ # Sentiment Analysis
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
  from textblob import TextBlob
+
+ # URL Parsing
  from urllib.parse import urlparse
- import nltk
+
+ # Data Visualization
+ import plotly.express as px
+ import matplotlib.pyplot as plt
+
+ # Word Cloud Generation
+ from wordcloud import WordCloud
+
+ # Other Libraries
+ import torch
+ import requests
+ import subprocess
+ import logging
+ import json
+ import re
+ import os
+
+ # NLTK Data Download
+ nltk.download('stopwords')
+ nltk.download('wordnet')
  nltk.download('punkt')

  ## ............................................... ##
  # Set page configuration (Call this once and make changes as needed)
- st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':rocket:')
+ st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':newspaper:')

  with st.container():
-     # Initialize Streamlit app
-     st.title('News Article Scrapping')
-     st.write("Created by Bayhaqy")
+     # Initialize Streamlit app
+     st.title('News Article Scrapping')
+     st.write("Created by Bayhaqy")

  ## ............................................... ##
  # Set up logging
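
Reviewer note on this hunk: %%writefile app.py is an IPython cell magic, so it only works when this code is run from a notebook cell; as the first line of pages/News_Scrapping.py it is a syntax error. The unconditional nltk.download() calls also re-run on every Streamlit rerun. A minimal sketch of a guarded download, assuming only the standard NLTK resource paths (not part of this commit):

# Hedged sketch: fetch the NLTK data this page needs only when it is missing,
# so repeated Streamlit reruns skip the network round-trip.
import nltk

for resource, path in [('punkt', 'tokenizers/punkt'),
                       ('stopwords', 'corpora/stopwords'),
                       ('wordnet', 'corpora/wordnet')]:
    try:
        nltk.data.find(path)   # raises LookupError when the resource is absent
    except LookupError:
        nltk.download(resource)
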
@@ -140,34 +175,89 @@ def translate_text(text, source='auto', target='en'):
          logging.error(f"Translation error: {str(e)}")
          return text

+ ## ............................................... ##
+ # Function to preprocess the data
+ @st.cache_data
+ def preprocessing_data(df):
+     # Remove duplicates
+     df = df.drop_duplicates(subset='Translation')
+
+     # Reset the index to add the date column
+     df.reset_index(inplace=True,drop=True)
+
+     # Function to clean and preprocess text
+     def clean_text(text):
+         # Remove URLs
+         text = re.sub(r'http\S+', '', text)
+
+         # Convert to lowercase
+         text = text.lower()
+
+         # Remove non-alphanumeric characters
+         text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+         # Tokenize text
+         words = nltk.word_tokenize(text)
+
+         # Remove stopwords
+         stop_words = set(stopwords.words('english'))
+         words = [word for word in words if word not in stop_words]
+
+         # Lemmatize words
+         lemmatizer = WordNetLemmatizer()
+         words = [lemmatizer.lemmatize(word) for word in words]
+
+         return ' '.join(words)
+
+     # Apply the clean_text function to the "Translation" column
+     df['Cleaned Translation'] = df['Translation'].apply(clean_text)
+
+     return df
+
+ ## ............................................... ##
+ # Function to create a Word Cloud
+ @st.cache_data
+ def create_wordcloud(df):
+     # Combine all text
+     text = ' '.join(df['Cleaned Translation'])
+
+     # Create a Word Cloud
+     wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)
+
+     # Convert the word cloud to an image
+     wordcloud_image = wordcloud.to_image()
+
+     # Display the Word Cloud using st.image
+     st.image(wordcloud_image, use_column_width=True)
+
  ## ............................................... ##
  with st.container():
-     # Input search parameters
-     search_term = st.text_input('Enter a search term :', 'Indonesia')
+     # Input search parameters
+     search_term = st.text_input('Enter a search term :', 'Indonesia')

-     col1, col2, col3 = st.columns(3)
+     col1, col2, col3 = st.columns(3)

-     with col1:
-         period = st.text_input('Enter a news period :', '7d')
-         max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
-     with col2:
-         country = st.text_input('Country :', 'Indonesia')
-         language = st.text_input('Language :', 'indonesian')
-     with col3:
-         start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
-         end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))
+     with col1:
+         period = st.text_input('Enter a news period :', '7d')
+         max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+     with col2:
+         country = st.text_input('Country :', 'Indonesia')
+         language = st.text_input('Language :', 'indonesian')
+     with col3:
+         start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+         end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))

  ## ............................................... ##
  with st.container():
-     col1, col2 = st.columns(2)
+     col1, col2 = st.columns(2)

-     with col1:
-         # Checkbox options for different processing steps
-         include_translation = st.checkbox("Include Translation", value=False)
-         include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
-     with col2:
-         include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
-         include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)
+     with col1:
+         # Checkbox options for different processing steps
+         include_translation = st.checkbox("Include Translation", value=True)
+         include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True)
+     with col2:
+         include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True)
+         include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True)

  ## ............................................... ##
  # Create a variable to track whether the data has been processed
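
The preprocessing_data helper added above is easiest to sanity-check outside Streamlit. A standalone sketch of the same clean_text steps, assuming the NLTK data from the import hunk is present; the sample sentence and the printed result are illustrative only:

# Standalone version of the clean_text steps from preprocessing_data.
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    text = re.sub(r'http\S+', '', text)      # remove URLs
    text = text.lower()                       # lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # keep letters and whitespace
    words = nltk.word_tokenize(text)          # tokenize
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(w) for w in words)

print(clean_text('Breaking news: markets are rising! https://example.com'))
# -> 'breaking news market rising' (exact tokens depend on the NLTK data version)
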
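Similarly, the WordCloud call in create_wordcloud can be exercised headlessly. This sketch reuses the commit's width/height/max_words values on a toy corpus and renders with matplotlib instead of st.image:

# Headless check of the WordCloud parameters used above, on a toy corpus.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = 'economy market inflation growth economy trade market economy'
wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
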
@@ -188,84 +278,136 @@ model, tokenizer = get_models_and_tokenizers()

  ## ............................................... ##
  with st.container():
-     # Fetch news and process articles
-     if st.button('Fetch and Process News'):
-
-         # Your news retrieval code
-         google_news = GNews()
-
-         google_news.period = period  # News from last 7 days
-         google_news.max_results = max_results  # number of responses across a keyword
-         google_news.country = country  # News from a specific country
-         google_news.language = language  # News in a specific language
-         #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # Exclude news from specific website i.e Yahoo.com and CNN.com
-         google_news.start_date = (start_date.year, start_date.month, start_date.day)  # Search from 1st Jan 2023
-         google_news.end_date = (end_date.year, end_date.month, end_date.day)  # Search until 1st Dec 2023
-
-         news = google_news.get_news(search_term)
-
-         ## ............................................... ##,
-         # Progress bar for fetching and processing news
-         progress_bar = st.progress(0)
-         total_news = len(news)
-
-         # Your news retrieval code (assuming 'news' is a list of article URLs)
-         #for x in news:
-         for idx, x in enumerate(news):
-             result = process_article(x['url'], _config=config)
-             if result is not None:
-                 publish_date, language, url, source_url, title, authors, keywords, text, summary = result
-                 temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+     # Fetch news and process articles
+     if st.button('Fetch and Process News'):
+         # Your news retrieval code
+         google_news = GNews()
+         google_news.period = period  # News from last 7 days
+         google_news.max_results = max_results  # number of responses across a keyword
+         google_news.country = country  # News from a specific country
+         google_news.language = language  # News in a specific language
+         #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # Exclude news from specific website i.e Yahoo.com and CNN.com
+         google_news.start_date = (start_date.year, start_date.month, start_date.day)  # Search from 1st Jan 2023
+         google_news.end_date = (end_date.year, end_date.month, end_date.day)  # Search until 1st Dec 2023
+
+         news = google_news.get_news(search_term)
+
+         ## ............................................... ##,
+         # Progress bar for fetching and processing news
+         progress_bar = st.progress(0)
+         total_news = len(news)
+
+         # Your news retrieval code (assuming 'news' is a list of article URLs)
+         #for x in news:
+         for idx, x in enumerate(news):
+             result = process_article(x['url'], _config=config)
+             if result is not None:
+                 publish_date, language, url, source_url, title, authors, keywords, text, summary = result
+
+                 # Insert to dataframe
+                 temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
                                          'Text': [text], 'Summary': [summary]})
-                 df = pd.concat([df, temp_df], ignore_index=True)
-
-             # Update the progress bar
-             progress = (idx + 1) / total_news
-             progress_bar.progress(progress)
-
-         # Conditionally apply translation function to the 'Translation' column
-         if include_translation:
-             df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
-
-         # Conditionally apply sentiment analysis function to the 'Translation' column
-         if include_sentiment_analysis:
-             df[['Fake_Check', 'Sentiment_Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))
+                 df = pd.concat([df, temp_df], ignore_index=True)
+
+             # Convert 'Publish_Date' to DatetimeIndex
+             df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])
+
+             # Update the progress bar
+             progress = (idx + 1) / total_news
+             progress_bar.progress(progress)
+
+         # Conditionally apply translation function to the 'Translation' column
+         if include_translation:
+             df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
+
+         # Preprocessing Data
+         df = preprocessing_data(df)
+
+         # Conditionally apply sentiment analysis function to the 'Translation' column
+         if include_sentiment_analysis:
+             df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))

-
-         # Conditionally apply VADER sentiment analysis to the 'Translation' column
-         if include_sentiment_vader:
-             df['Sentiment_VADER'] = df['Translation'].apply(analyze_sentiment_vader)
-
-         # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
-         if include_sentiment_textblob:
-             df['Sentiment_TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
-
-         # Set data_processed to True when the data has been successfully processed
-         data_processed = True
-
-         ## ............................................... ##
-         # Add a button to download the data as a CSV file
-         if data_processed:
-             st.markdown("### Download Processed Data as CSV")
-             st.write("Click the button below to download the processed data as a CSV file.")
-
-             # Create a downloadable link
-             csv_data = df.to_csv(index=False).encode()
-             st.download_button(
-                 label="Download CSV",
-                 data=csv_data,
-                 file_name="processed_data.csv",
-             )
-
-     with st.expander("See Table"):
+
+         # Conditionally apply VADER sentiment analysis to the 'Translation' column
+         if include_sentiment_vader:
+             df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)
+
+         # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+         if include_sentiment_textblob:
+             df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
+
+         # Set data_processed to True when the data has been successfully processed
+         data_processed = True
+
  ## ............................................... ##
- # Display processed data
+ # Add a button to download the data as a CSV file
  if data_processed:
-     st.dataframe(df)
-
-     with st.expander("See EDA"):
+     st.markdown("### Download Processed Data as CSV")
+     st.write("Click the button below to download the processed data as a CSV file.")
+
+     # Create a downloadable link
+     csv_data = df.to_csv(index=False).encode()
+     st.download_button(
+         label="Download CSV",
+         data=csv_data,
+         file_name="processed_data.csv",
+     )
+
+ ## ............................................... ##
+ with st.expander("See for Table"):
+     # Display processed data
+     if data_processed:
+         #st.dataframe(df)
+         AgGrid(df, height=400)
+
  ## ............................................... ##
  # Display processed data
- if data_processed:
-     pr = ProfileReport(df)
-     st_profile_report(pr)
+ with st.expander("See for Exploratory Data Analysis"):
+     if data_processed:
+         col1, col2 = st.columns(2)
+         with col1:
+             ## ............................................... ##
+             # Create a DataFrame to count the number of tweets by Fake Check
+             FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
+             FakeCheck_counts.columns = ['Fake Check', 'News Count']
+             fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
+             st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)
+
+             ## ............................................... ##
+             # Create wordcloud
+             try:
+                 st.write('WordCloud for News')
+                 create_wordcloud(df)
+             except Exception as e:
+                 logging.error(f" Column Translation Not Available : {str(e)}")
+
+             ## ............................................... ##
+
+         with col2:
+             ## ............................................... ##
+             # Create a DataFrame to count the number of News by language
+             language_counts = df['Language'].value_counts().reset_index()
+             language_counts.columns = ['Language', 'News Count']
+             fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
+             st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)
+
+             ## ............................................... ##
+             # Group by Sentiment columns and get the count
+             try:
+                 sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
+                 sentiment_counts = sentiment_counts.reset_index()
+                 sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
+                 fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
+                 st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)
+
+             except Exception as e:
+                 logging.error(f" Column Sentiment Not Available : {str(e)}")
+
+             ## ............................................... ##
+
+ with st.expander("See for Analysis with ydata-profiling"):
+     ## ............................................... ##
+     # Display processed data
+     if data_processed:
+         pr = ProfileReport(df)
+         st_profile_report(pr)
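
The fetch-and-process block above drives everything through GNews attributes. A minimal sketch of the same retrieval flow outside Streamlit, using the page's default widget values; the 'title' key is an assumption about the GNews result schema (the commit itself only reads 'url'):

# Minimal sketch of the GNews flow with the page's default widget values.
from gnews import GNews

google_news = GNews()
google_news.period = '7d'
google_news.max_results = 10
google_news.country = 'Indonesia'
google_news.language = 'indonesian'
google_news.start_date = (2023, 1, 1)
google_news.end_date = (2023, 12, 1)

news = google_news.get_news('Indonesia')
for item in news[:3]:
    print(item.get('title'), '->', item.get('url'))
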
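GridOptionsBuilder is imported in this commit but never called, so AgGrid(df, height=400) runs with defaults. A hedged sketch of wiring the builder in, assuming st-aggrid's from_dataframe/configure_pagination/build methods; the toy frame is illustrative:

# Hedged sketch (runs inside a Streamlit page): feed AgGrid explicit grid
# options instead of defaults. Assumes st-aggrid's GridOptionsBuilder API.
import pandas as pd
from st_aggrid import AgGrid, GridOptionsBuilder

df = pd.DataFrame({'Title': ['a', 'b'], 'Language': ['id', 'en']})  # toy frame

gb = GridOptionsBuilder.from_dataframe(df)
gb.configure_pagination()   # assumption: paginate long result sets
AgGrid(df, gridOptions=gb.build(), height=400)
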
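The 'Total News per Sentiment' chart relies on a value_counts/melt reshape that is easy to misread. The same chain on a toy frame, with the commit's column names, shows the long-form layout px.bar receives:

# The value_counts/melt reshape behind the grouped sentiment chart, on toy data.
import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    'Sentiment Distilbert': ['positive', 'negative', 'positive'],
    'Sentiment VADER':      ['positive', 'neutral',  'positive'],
    'Sentiment TextBlob':   ['neutral',  'negative', 'positive'],
})

# Count labels per analyzer, then transpose so each row is one analyzer.
sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
sentiment_counts = sentiment_counts.reset_index()

# Long form: one row per (analyzer, label) pair; 'index' holds the analyzer name.
sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')

fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
fig.show()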