Update pages/News_Scrapping.py
pages/News_Scrapping.py  +249 -107
CHANGED
@@ -1,29 +1,64 @@
+%%writefile app.py
+# News Information and data article
+from newspaper import Article, Config
+from gnews import GNews
+
+# Data Analysis and Profiling
+import pandas as pd
from ydata_profiling import ProfileReport
+from st_aggrid import AgGrid, GridOptionsBuilder
+
+# Streamlit for Building the Dashboard
import streamlit as st
+from streamlit_pandas_profiling import st_profile_report
+
+# Language Detection
from langdetect import detect
+
+# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from bs4 import BeautifulSoup
+
+# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
+
+# URL Parsing
from urllib.parse import urlparse
+
+# Data Visualization
+import plotly.express as px
+import matplotlib.pyplot as plt
+
+# Word Cloud Generation
+from wordcloud import WordCloud
+
+# Other Libraries
+import torch
+import requests
+import subprocess
+import logging
+import json
+import re
+import os
+
+# NLTK Data Download
+nltk.download('stopwords')
+nltk.download('wordnet')
nltk.download('punkt')

## ............................................... ##
# Set page configuration (Call this once and make changes as needed)
+st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':newspaper:')

with st.container():
+    # Initialize Streamlit app
+    st.title('News Article Scrapping')
+    st.write("Created by Bayhaqy")

## ............................................... ##
# Set up logging
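Note: `translate_text` itself is defined in an unchanged part of the file; the hunk below only shows its error-handling tail. A minimal standalone sketch, assuming it wraps the langdetect and deep_translator calls imported above (the function name and fallback behaviour here are illustrative, not the repository's exact code):

from langdetect import detect
from deep_translator import GoogleTranslator

def translate_text_sketch(text, target='en'):
    # Illustrative only: detect the source language, translate to English,
    # and fall back to the original text if anything fails.
    try:
        source = detect(text)  # e.g. 'id' for Indonesian
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception:
        return text

print(translate_text_sketch('Berita terbaru dari Indonesia'))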
@@ -140,34 +175,89 @@ def translate_text(text, source='auto', target='en'):
        logging.error(f"Translation error: {str(e)}")
        return text

+## ............................................... ##
+# Function to preprocess the data
+@st.cache_data
+def preprocessing_data(df):
+    # Remove duplicates
+    df = df.drop_duplicates(subset='Translation')
+
+    # Reset the index to add the date column
+    df.reset_index(inplace=True, drop=True)
+
+    # Function to clean and preprocess text
+    def clean_text(text):
+        # Remove URLs
+        text = re.sub(r'http\S+', '', text)
+
+        # Convert to lowercase
+        text = text.lower()
+
+        # Remove non-alphanumeric characters
+        text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+        # Tokenize text
+        words = nltk.word_tokenize(text)
+
+        # Remove stopwords
+        stop_words = set(stopwords.words('english'))
+        words = [word for word in words if word not in stop_words]
+
+        # Lemmatize words
+        lemmatizer = WordNetLemmatizer()
+        words = [lemmatizer.lemmatize(word) for word in words]
+
+        return ' '.join(words)
+
+    # Apply the clean_text function to the "Translation" column
+    df['Cleaned Translation'] = df['Translation'].apply(clean_text)
+
+    return df
+
+## ............................................... ##
+# Function to create a Word Cloud
+@st.cache_data
+def create_wordcloud(df):
+    # Combine all text
+    text = ' '.join(df['Cleaned Translation'])
+
+    # Create a Word Cloud
+    wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)
+
+    # Convert the word cloud to an image
+    wordcloud_image = wordcloud.to_image()
+
+    # Display the Word Cloud using st.image
+    st.image(wordcloud_image, use_column_width=True)
+
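For a sense of what the clean_text helper above produces, here is a small standalone sketch of the same steps (URL removal, lowercasing, tokenization, stopword removal, lemmatization). The sample sentence and the expected output are illustrative, and the sketch assumes the NLTK data downloaded at the top of the page is available:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

sample = 'Breaking: markets are rallying today, see https://example.com for details!'
text = re.sub(r'http\S+', '', sample).lower()          # strip URLs, lowercase
text = re.sub(r'[^a-zA-Z\s]', '', text)                 # keep letters and spaces only
words = nltk.word_tokenize(text)                        # tokenize
words = [w for w in words if w not in set(stopwords.words('english'))]
lemmatizer = WordNetLemmatizer()
print(' '.join(lemmatizer.lemmatize(w) for w in words))
# roughly: "breaking market rallying today see detail"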
## ............................................... ##
with st.container():
+    # Input search parameters
+    search_term = st.text_input('Enter a search term :', 'Indonesia')

+    col1, col2, col3 = st.columns(3)

+    with col1:
+        period = st.text_input('Enter a news period :', '7d')
+        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+    with col2:
+        country = st.text_input('Country :', 'Indonesia')
+        language = st.text_input('Language :', 'indonesian')
+    with col3:
+        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))

## ............................................... ##
with st.container():
+    col1, col2 = st.columns(2)

+    with col1:
+        # Checkbox options for different processing steps
+        include_translation = st.checkbox("Include Translation", value=True)
+        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True)
+    with col2:
+        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True)
+        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True)

## ............................................... ##
# Create a variable to track whether the data has been processed
@@ -188,84 +278,136 @@ model, tokenizer = get_models_and_tokenizers()

## ............................................... ##
with st.container():
+    # Fetch news and process articles
+    if st.button('Fetch and Process News'):
+        # Configure the news retrieval
+        google_news = GNews()
+        google_news.period = period            # e.g. news from the last 7 days
+        google_news.max_results = max_results  # number of responses for the keyword
+        google_news.country = country          # news from a specific country
+        google_news.language = language        # news in a specific language
+        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # exclude news from specific websites, e.g. Yahoo.com and CNN.com
+        google_news.start_date = (start_date.year, start_date.month, start_date.day)  # search from the selected start date
+        google_news.end_date = (end_date.year, end_date.month, end_date.day)          # search until the selected end date

+        news = google_news.get_news(search_term)

+        ## ............................................... ##
+        # Progress bar for fetching and processing news
+        progress_bar = st.progress(0)
+        total_news = len(news)

+        # Process each article returned by GNews ('news' is a list of article dicts)
+        for idx, x in enumerate(news):
+            result = process_article(x['url'], _config=config)
+            if result is not None:
+                publish_date, language, url, source_url, title, authors, keywords, text, summary = result

+                # Insert to dataframe
+                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
                                        'Text': [text], 'Summary': [summary]})
+                df = pd.concat([df, temp_df], ignore_index=True)

+                # Convert 'Publish_Date' to DatetimeIndex
+                df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])

+            # Update the progress bar
+            progress = (idx + 1) / total_news
+            progress_bar.progress(progress)

+        # Conditionally apply translation function to the 'Translation' column
+        if include_translation:
+            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)

+        # Preprocessing Data
+        df = preprocessing_data(df)

+        # Conditionally apply sentiment analysis function to the 'Translation' column
+        if include_sentiment_analysis:
+            df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))

+        # Conditionally apply VADER sentiment analysis to the 'Translation' column
+        if include_sentiment_vader:
+            df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)

+        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+        if include_sentiment_textblob:
+            df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)

+        # Set data_processed to True when the data has been successfully processed
+        data_processed = True

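analyze_sentiment_distilbert, analyze_sentiment_vader and analyze_sentiment_textblob are defined in unchanged parts of the file. A minimal sketch of the kind of scoring the VADER and TextBlob helpers presumably perform with the libraries imported above (label names and thresholds here are illustrative assumptions, not the repository's code):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

def vader_label_sketch(text):
    # VADER compound score ranges from -1 (most negative) to +1 (most positive)
    score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    return 'Positive' if score >= 0.05 else 'Negative' if score <= -0.05 else 'Neutral'

def textblob_label_sketch(text):
    # TextBlob polarity ranges from -1.0 to 1.0
    polarity = TextBlob(text).sentiment.polarity
    return 'Positive' if polarity > 0 else 'Negative' if polarity < 0 else 'Neutral'

print(vader_label_sketch('The new policy is a great success'))
print(textblob_label_sketch('The new policy is a great success'))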
## ............................................... ##
+# Add a button to download the data as a CSV file
if data_processed:
+    st.markdown("### Download Processed Data as CSV")
+    st.write("Click the button below to download the processed data as a CSV file.")

+    # Create a downloadable link
+    csv_data = df.to_csv(index=False).encode()
+    st.download_button(
+        label="Download CSV",
+        data=csv_data,
+        file_name="processed_data.csv",
+    )

+## ............................................... ##
+with st.expander("See for Table"):
+    # Display processed data
+    if data_processed:
+        #st.dataframe(df)
+        AgGrid(df, height=400)

## ............................................... ##
# Display processed data
+with st.expander("See for Exploratory Data Analysis"):
+    if data_processed:
+        col1, col2 = st.columns(2)
+        with col1:
+            ## ............................................... ##
+            # Create a DataFrame to count the number of news items by Fake Check
+            FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
+            FakeCheck_counts.columns = ['Fake Check', 'News Count']
+            fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
+            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            ## ............................................... ##
+            # Create wordcloud
+            try:
+                st.write('WordCloud for News')
+                create_wordcloud(df)
+            except Exception as e:
+                logging.error(f" Column Translation Not Available : {str(e)}")

+            ## ............................................... ##

+        with col2:
+            ## ............................................... ##
+            # Create a DataFrame to count the number of News by language
+            language_counts = df['Language'].value_counts().reset_index()
+            language_counts.columns = ['Language', 'News Count']
+            fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
+            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            ## ............................................... ##
+            # Group by Sentiment columns and get the count
+            try:
+                sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
+                sentiment_counts = sentiment_counts.reset_index()
+                sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
+                fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
+                st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            except Exception as e:
+                logging.error(f" Column Sentiment Not Available : {str(e)}")

+            ## ............................................... ##

+with st.expander("See for Analysis with ydata-profiling"):
+    ## ............................................... ##
+    # Display processed data
+    if data_processed:
+        pr = ProfileReport(df)
+        st_profile_report(pr)
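For orientation, `process_article` and `config` also come from unchanged parts of the file. A rough sketch of the newspaper3k flow that would yield the nine fields unpacked in the processing loop above (the helper name, field order and timeout setting are assumptions, not the repository's code):

from newspaper import Article, Config
from urllib.parse import urlparse
from langdetect import detect

config = Config()
config.request_timeout = 10  # illustrative setting, not necessarily the app's value

def process_article_sketch(url, _config=config):
    # Download, parse and run newspaper's NLP step, then return the fields
    # consumed by the dataframe-building loop.
    try:
        article = Article(url, config=_config)
        article.download()
        article.parse()
        article.nlp()
        source_url = urlparse(url).netloc
        language = detect(article.text)
        return (article.publish_date, language, url, source_url, article.title,
                article.authors, article.keywords, article.text, article.summary)
    except Exception:
        return None

The leading underscore in the `_config` keyword used in the loop is the Streamlit convention for excluding an argument from @st.cache_data hashing, which suggests the real helper is cached per URL.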