import streamlit as st
import pandas as pd
import re
import io
import string
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os
import nltk
from nltk.stem import WordNetLemmatizer

# Specify the directory for nltk_data
nltk_data_dir = '/root/nltk_data'

# Ensure the directory exists
os.makedirs(nltk_data_dir, exist_ok=True)

# Set the NLTK data path explicitly
nltk.data.path.append(nltk_data_dir)

# Try downloading required NLTK resources
try:
    nltk.download('punkt', download_dir=nltk_data_dir)
    nltk.download('stopwords', download_dir=nltk_data_dir)
    nltk.download('wordnet', download_dir=nltk_data_dir)
except Exception as e:
    print(f"Error while downloading NLTK resources: {e}")

# Load the fine-tuned model and tokenizer (adjust the model name as needed).
# st.cache_resource keeps them in memory across Streamlit reruns.
@st.cache_resource
def load_classifier():
    model_name = "TAgroup5/news-classification-model"  # Replace with the correct model name
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)

# Initialize the classification pipeline
text_classification_pipeline = load_classifier()
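# A text-classification pipeline call returns a list of dicts such as
# [{'label': ..., 'score': ...}]; the label names come from the fine-tuned model's config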

# Streamlit App
st.title("News Classification and Q&A")

## ====================== Component 1: News Classification ====================== ##
st.header("Classify News Articles")
st.markdown("Upload a CSV file with a 'content' column to classify news into categories.")

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file, encoding="utf-8")
    except UnicodeDecodeError:
        # Rewind the buffer and fall back to Latin-1 for non-UTF-8 files
        uploaded_file.seek(0)
        df = pd.read_csv(uploaded_file, encoding="ISO-8859-1")

    if 'content' not in df.columns:
        st.error("Error: The uploaded CSV must contain a 'content' column.")
    else:
        st.write("Preview of uploaded data:")
        st.dataframe(df.head())

        # Preprocessing function to clean the text
        def preprocess_text(text):
            text = text.lower()  # Convert to lowercase
            text = re.sub(r'[^a-z\s]', '', text)  # Strip digits, punctuation, and other special characters
            # No manual tokenization is needed here; the model's tokenizer handles it
            return text
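        # Example of the cleaning above:
        #   preprocess_text("Breaking News: 5 Things to Watch!") -> "breaking news  things to watch"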


        # Apply preprocessing before classification
        df['processed_content'] = df['content'].apply(preprocess_text)

        # Classify each record into one of the five classes;
        # truncation=True keeps long articles within the model's maximum input length
        df['class'] = df['processed_content'].apply(
            lambda x: text_classification_pipeline(x, truncation=True)[0]['label'] if x.strip() else "Unknown"
        )

        # Show results
        st.write("Classification Results:")
        st.dataframe(df[['content', 'class']])

        # Provide CSV download
        output = io.BytesIO()
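        # utf-8-sig prepends a BOM so spreadsheet apps detect the encoding correctly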
        df.to_csv(output, index=False, encoding="utf-8-sig")
        st.download_button(label="Download classified news", data=output.getvalue(), file_name="output.csv", mime="text/csv")

## ====================== Component 2: Q&A ====================== ##
st.header("Ask a Question About the News")
st.markdown("Enter a question and provide a news article to get an answer.")

question = st.text_input("Ask a question:")
context = st.text_area("Provide the news article or content for the Q&A:", height=150)

if question and context.strip():
    # Cache the Q&A pipeline so it is not reloaded on every interaction
    @st.cache_resource
    def load_qa_pipeline():
        qa_model_name = "distilbert-base-uncased-distilled-squad"  # Example of a common Q&A model
        return pipeline("question-answering", model=qa_model_name, tokenizer=qa_model_name)

    qa_pipeline = load_qa_pipeline()
    result = qa_pipeline(question=question, context=context)
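    # A question-answering pipeline returns a dict like
    # {'answer': ..., 'score': ..., 'start': ..., 'end': ...}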
    
    # Check if the result contains an answer
    if 'answer' in result and result['answer']:
        st.write("Answer:", result['answer'])
    else:
        st.write("No answer found in the provided content.")