Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,31 +2,8 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import re
|
4 |
import io
|
5 |
-
import string
|
6 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
7 |
-
|
8 |
-
from nltk.corpus import stopwords
|
9 |
-
import os
|
10 |
-
import nltk
|
11 |
-
from nltk.stem import WordNetLemmatizer
|
12 |
-
|
13 |
-
# Specify the directory for nltk_data
|
14 |
-
nltk_data_dir = '/root/nltk_data'
|
15 |
-
|
16 |
-
# Ensure the directory exists
|
17 |
-
if not os.path.exists(nltk_data_dir):
|
18 |
-
os.makedirs(nltk_data_dir)
|
19 |
-
|
20 |
-
# Set the NLTK data path explicitly
|
21 |
-
nltk.data.path.append(nltk_data_dir)
|
22 |
-
|
23 |
-
# Try downloading required NLTK resources
|
24 |
-
try:
|
25 |
-
nltk.download('punkt', download_dir=nltk_data_dir)
|
26 |
-
nltk.download('stopwords', download_dir=nltk_data_dir)
|
27 |
-
nltk.download('wordnet', download_dir=nltk_data_dir)
|
28 |
-
except Exception as e:
|
29 |
-
print(f"Error while downloading NLTK resources: {e}")
|
30 |
|
31 |
# Load fine-tuned model and tokenizer (adjust the model name)
|
32 |
model_name = "TAgroup5/news-classification-model" # Replace with the correct model name
|
@@ -35,6 +12,8 @@ tokenizer = AutoTokenizer.from_pretrained(news-classification-model)
|
|
35 |
|
36 |
# Initialize pipelines
|
37 |
text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
|
|
|
|
38 |
|
39 |
# Streamlit App
|
40 |
st.title("News Classification and Q&A")
|
@@ -60,6 +39,7 @@ if uploaded_file is not None:
|
|
60 |
# Preprocessing function to clean the text
|
61 |
def preprocess_text(text):
    """Normalize raw article text before it is fed to the model tokenizer.

    Lowercases the input and strips every character that is not a
    lowercase letter or whitespace. Tokenization is deliberately left to
    the transformers tokenizer downstream.

    Args:
        text: The raw input string.

    Returns:
        The lowercased string containing only letters and whitespace.
    """
    lowered = text.lower()
    # Keep only a-z and whitespace; punctuation and digits are dropped.
    return re.sub(r'[^a-z\s]', '', lowered)
|
|
|
2 |
import pandas as pd
|
3 |
import re
|
4 |
import io
|
|
|
5 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
|
6 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Load fine-tuned model and tokenizer (adjust the model name)
|
9 |
model_name = "TAgroup5/news-classification-model" # Replace with the correct model name
|
|
|
12 |
|
13 |
# Initialize pipelines
|
14 |
text_classification_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)
|
15 |
+
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
|
16 |
+
|
17 |
|
18 |
# Streamlit App
|
19 |
st.title("News Classification and Q&A")
|
|
|
39 |
# Preprocessing function to clean the text
|
40 |
def preprocess_text(text):
    """Normalize raw article text before it is fed to the model tokenizer.

    Steps, in order:
      1. Lowercase the input.
      2. Remove every character that is not a lowercase letter or whitespace.
      3. Collapse runs of whitespace into single spaces and strip the ends.

    The whitespace collapse runs AFTER character removal (the original ran
    it before), because deleting punctuation/digits can leave adjacent
    spaces behind (e.g. "a , b" -> "a  b"). Tokenization is deliberately
    left to the transformers tokenizer downstream.

    Args:
        text: The raw input string.

    Returns:
        A lowercased, letters-and-single-spaces-only string with no
        leading or trailing whitespace.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters & numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra spaces left by removal
    # No tokenization here: the model tokenizer handles it.
    return text
|