Update pages/News_Scrapping.py
pages/News_Scrapping.py  +249 -107
CHANGED
@@ -1,29 +1,64 @@
+%%writefile app.py
+# News Information and data article
+from newspaper import Article, Config
+from gnews import GNews
+
+# Data Analysis and Profiling
+import pandas as pd
from ydata_profiling import ProfileReport
+from st_aggrid import AgGrid, GridOptionsBuilder
+
+# Streamlit for Building the Dashboard
import streamlit as st
+from streamlit_pandas_profiling import st_profile_report
+
+# Language Detection
from langdetect import detect
+
+# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from bs4 import BeautifulSoup
+
+# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
+
+# URL Parsing
from urllib.parse import urlparse
+
+# Data Visualization
+import plotly.express as px
+import matplotlib.pyplot as plt
+
+# Word Cloud Generation
+from wordcloud import WordCloud
+
+# Other Libraries
+import torch
+import requests
+import subprocess
+import logging
+import json
+import re
+import os
+
+# NLTK Data Download
+nltk.download('stopwords')
+nltk.download('wordnet')
nltk.download('punkt')

## ............................................... ##
# Set page configuration (Call this once and make changes as needed)
+st.set_page_config(page_title='News Scrapping', layout='wide', page_icon=':newspaper:')

with st.container():
+    # Initialize Streamlit app
+    st.title('News Article Scrapping')
+    st.write("Created by Bayhaqy")

## ............................................... ##
# Set up logging
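Note: `translate_text` itself is defined in an unchanged part of the file; the hunk below only shows its error-handling tail. A minimal standalone sketch, assuming it wraps the langdetect and deep_translator calls imported above (the function name and fallback behaviour here are illustrative, not the repository's exact code):

from langdetect import detect
from deep_translator import GoogleTranslator

def translate_text_sketch(text, target='en'):
    # Illustrative only: detect the source language, translate to English,
    # and fall back to the original text if anything fails.
    try:
        source = detect(text)  # e.g. 'id' for Indonesian
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception:
        return text

print(translate_text_sketch('Berita terbaru dari Indonesia'))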
@@ -140,34 +175,89 @@ def translate_text(text, source='auto', target='en'):
        logging.error(f"Translation error: {str(e)}")
        return text

+## ............................................... ##
+# Function to preprocess the data
+@st.cache_data
+def preprocessing_data(df):
+    # Remove duplicates
+    df = df.drop_duplicates(subset='Translation')
+
+    # Reset the index to add the date column
+    df.reset_index(inplace=True, drop=True)
+
+    # Function to clean and preprocess text
+    def clean_text(text):
+        # Remove URLs
+        text = re.sub(r'http\S+', '', text)
+
+        # Convert to lowercase
+        text = text.lower()
+
+        # Remove non-alphanumeric characters
+        text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+        # Tokenize text
+        words = nltk.word_tokenize(text)
+
+        # Remove stopwords
+        stop_words = set(stopwords.words('english'))
+        words = [word for word in words if word not in stop_words]
+
+        # Lemmatize words
+        lemmatizer = WordNetLemmatizer()
+        words = [lemmatizer.lemmatize(word) for word in words]
+
+        return ' '.join(words)
+
+    # Apply the clean_text function to the "Translation" column
+    df['Cleaned Translation'] = df['Translation'].apply(clean_text)
+
+    return df
+
+## ............................................... ##
+# Function to create a Word Cloud
+@st.cache_data
+def create_wordcloud(df):
+    # Combine all text
+    text = ' '.join(df['Cleaned Translation'])
+
+    # Create a Word Cloud
+    wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)
+
+    # Convert the word cloud to an image
+    wordcloud_image = wordcloud.to_image()
+
+    # Display the Word Cloud using st.image
+    st.image(wordcloud_image, use_column_width=True)
+
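For a sense of what the clean_text helper above produces, here is a small standalone sketch of the same steps (URL removal, lowercasing, tokenization, stopword removal, lemmatization). The sample sentence and the expected output are illustrative, and the sketch assumes the NLTK data downloaded at the top of the page is available:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

sample = 'Breaking: markets are rallying today, see https://example.com for details!'
text = re.sub(r'http\S+', '', sample).lower()          # strip URLs, lowercase
text = re.sub(r'[^a-zA-Z\s]', '', text)                 # keep letters and spaces only
words = nltk.word_tokenize(text)                        # tokenize
words = [w for w in words if w not in set(stopwords.words('english'))]
lemmatizer = WordNetLemmatizer()
print(' '.join(lemmatizer.lemmatize(w) for w in words))
# roughly: "breaking market rallying today see detail"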
## ............................................... ##
with st.container():
+    # Input search parameters
+    search_term = st.text_input('Enter a search term :', 'Indonesia')

+    col1, col2, col3 = st.columns(3)

+    with col1:
+        period = st.text_input('Enter a news period :', '7d')
+        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
+    with col2:
+        country = st.text_input('Country :', 'Indonesia')
+        language = st.text_input('Language :', 'indonesian')
+    with col3:
+        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
+        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))

## ............................................... ##
with st.container():
+    col1, col2 = st.columns(2)

+    with col1:
+        # Checkbox options for different processing steps
+        include_translation = st.checkbox("Include Translation", value=True)
+        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True)
+    with col2:
+        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True)
+        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True)

## ............................................... ##
# Create a variable to track whether the data has been processed
@@ -188,84 +278,136 @@ model, tokenizer = get_models_and_tokenizers()

## ............................................... ##
with st.container():
+    # Fetch news and process articles
+    if st.button('Fetch and Process News'):
+        # Configure the news retrieval
+        google_news = GNews()
+        google_news.period = period            # e.g. news from the last 7 days
+        google_news.max_results = max_results  # number of responses for the keyword
+        google_news.country = country          # news from a specific country
+        google_news.language = language        # news in a specific language
+        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # exclude news from specific websites, e.g. Yahoo.com and CNN.com
+        google_news.start_date = (start_date.year, start_date.month, start_date.day)  # search from the selected start date
+        google_news.end_date = (end_date.year, end_date.month, end_date.day)          # search until the selected end date

+        news = google_news.get_news(search_term)

+        ## ............................................... ##
+        # Progress bar for fetching and processing news
+        progress_bar = st.progress(0)
+        total_news = len(news)

+        # Process each article returned by GNews ('news' is a list of article dicts)
+        for idx, x in enumerate(news):
+            result = process_article(x['url'], _config=config)
+            if result is not None:
+                publish_date, language, url, source_url, title, authors, keywords, text, summary = result

+                # Insert to dataframe
+                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
                                        'Text': [text], 'Summary': [summary]})
+                df = pd.concat([df, temp_df], ignore_index=True)

+                # Convert 'Publish_Date' to DatetimeIndex
+                df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])

+            # Update the progress bar
+            progress = (idx + 1) / total_news
+            progress_bar.progress(progress)

+        # Conditionally apply translation function to the 'Translation' column
+        if include_translation:
+            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)

+        # Preprocessing Data
+        df = preprocessing_data(df)

+        # Conditionally apply sentiment analysis function to the 'Translation' column
+        if include_sentiment_analysis:
+            df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))

+        # Conditionally apply VADER sentiment analysis to the 'Translation' column
+        if include_sentiment_vader:
+            df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)

+        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
+        if include_sentiment_textblob:
+            df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)

+        # Set data_processed to True when the data has been successfully processed
+        data_processed = True

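analyze_sentiment_distilbert, analyze_sentiment_vader and analyze_sentiment_textblob are defined in unchanged parts of the file. A minimal sketch of the kind of scoring the VADER and TextBlob helpers presumably perform with the libraries imported above (label names and thresholds here are illustrative assumptions, not the repository's code):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

def vader_label_sketch(text):
    # VADER compound score ranges from -1 (most negative) to +1 (most positive)
    score = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    return 'Positive' if score >= 0.05 else 'Negative' if score <= -0.05 else 'Neutral'

def textblob_label_sketch(text):
    # TextBlob polarity ranges from -1.0 to 1.0
    polarity = TextBlob(text).sentiment.polarity
    return 'Positive' if polarity > 0 else 'Negative' if polarity < 0 else 'Neutral'

print(vader_label_sketch('The new policy is a great success'))
print(textblob_label_sketch('The new policy is a great success'))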
## ............................................... ##
+# Add a button to download the data as a CSV file
if data_processed:
+    st.markdown("### Download Processed Data as CSV")
+    st.write("Click the button below to download the processed data as a CSV file.")

+    # Create a downloadable link
+    csv_data = df.to_csv(index=False).encode()
+    st.download_button(
+        label="Download CSV",
+        data=csv_data,
+        file_name="processed_data.csv",
+    )

+## ............................................... ##
+with st.expander("See for Table"):
+    # Display processed data
+    if data_processed:
+        #st.dataframe(df)
+        AgGrid(df, height=400)

## ............................................... ##
# Display processed data
+with st.expander("See for Exploratory Data Analysis"):
+    if data_processed:
+        col1, col2 = st.columns(2)
+        with col1:
+            ## ............................................... ##
+            # Create a DataFrame to count the number of news items by Fake Check
+            FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
+            FakeCheck_counts.columns = ['Fake Check', 'News Count']
+            fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
+            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            ## ............................................... ##
+            # Create wordcloud
+            try:
+                st.write('WordCloud for News')
+                create_wordcloud(df)
+            except Exception as e:
+                logging.error(f" Column Translation Not Available : {str(e)}")

+            ## ............................................... ##

+        with col2:
+            ## ............................................... ##
+            # Create a DataFrame to count the number of News by language
+            language_counts = df['Language'].value_counts().reset_index()
+            language_counts.columns = ['Language', 'News Count']
+            fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
+            st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            ## ............................................... ##
+            # Group by Sentiment columns and get the count
+            try:
+                sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
+                sentiment_counts = sentiment_counts.reset_index()
+                sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
+                fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
+                st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

+            except Exception as e:
+                logging.error(f" Column Sentiment Not Available : {str(e)}")

+            ## ............................................... ##

+with st.expander("See for Analysis with ydata-profiling"):
+    ## ............................................... ##
+    # Display processed data
+    if data_processed:
+        pr = ProfileReport(df)
+        st_profile_report(pr)
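For orientation, `process_article` and `config` also come from unchanged parts of the file. A rough sketch of the newspaper3k flow that would yield the nine fields unpacked in the processing loop above (the helper name, field order and timeout setting are assumptions, not the repository's code):

from newspaper import Article, Config
from urllib.parse import urlparse
from langdetect import detect

config = Config()
config.request_timeout = 10  # illustrative setting, not necessarily the app's value

def process_article_sketch(url, _config=config):
    # Download, parse and run newspaper's NLP step, then return the fields
    # consumed by the dataframe-building loop.
    try:
        article = Article(url, config=_config)
        article.download()
        article.parse()
        article.nlp()
        source_url = urlparse(url).netloc
        language = detect(article.text)
        return (article.publish_date, language, url, source_url, article.title,
                article.authors, article.keywords, article.text, article.summary)
    except Exception:
        return None

The leading underscore in the `_config` keyword used in the loop is the Streamlit convention for excluding an argument from @st.cache_data hashing, which suggests the real helper is cached per URL.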