Spaces:

Ashendilantha
/

News_Classification

Sleeping

App Files Files Community

Ashendilantha committed on Mar 30

Commit

42bdc4d

verified ·

1 Parent(s): e8f4b00

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -36

app.py CHANGED Viewed

@@ -8,9 +8,11 @@ from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 # Set page configuration
-st.set_page_config(page_title="News Analysis App", layout="wide")
 # Download required NLTK resources
 @st.cache_resource
@@ -25,10 +27,10 @@ download_nltk_resources()
 stop_words = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
-# Load classification model
 @st.cache_resource
 def load_classification_model():
-    model_name = "Oneli/News_Classification"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForSequenceClassification.from_pretrained(model_name)
     return model, tokenizer
@@ -39,24 +41,43 @@ def load_qa_pipeline():
     qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
     return qa_pipeline
-# Preprocessing function
 def preprocess_text(text):
     if pd.isna(text):
         return ""
     text = text.lower()
     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
     text = re.sub(r'<.*?>', '', text)
     text = re.sub(r'[^a-zA-Z\s]', '', text)
     tokens = word_tokenize(text)
     cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
     cleaned_text = ' '.join(cleaned_tokens)
     return cleaned_text
-# Batch classification function
 def classify_news(df, model, tokenizer):
     df['cleaned_content'] = df['content'].apply(preprocess_text)
     texts = df['cleaned_content'].tolist()
     predictions = []
     batch_size = 16
@@ -70,68 +91,113 @@ def classify_news(df, model, tokenizer):
             batch_predictions = torch.argmax(logits, dim=1).tolist()
             predictions.extend(batch_predictions)
     id2label = model.config.id2label
     df['class'] = [id2label[pred] for pred in predictions]
     return df
 # Main app
 def main():
-    st.title("News Analysis Application")
     st.sidebar.title("Navigation")
     app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])
     if app_mode == "News Classification":
-        st.header("News Article Classification")
-        uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
-        if uploaded_file is not None:
-            df = pd.read_csv(uploaded_file)
             st.subheader("Sample of uploaded data")
             st.dataframe(df.head())
             if 'content' not in df.columns:
-                st.error("The CSV file must contain a 'content' column.")
             else:
-                with st.spinner("Loading model..."):
                     model, tokenizer = load_classification_model()
                 if st.button("Classify Articles"):
                     with st.spinner("Classifying news articles..."):
                         result_df = classify_news(df, model, tokenizer)
                         st.subheader("Classification Results")
                         st.dataframe(result_df[['content', 'class']])
                         csv = result_df.to_csv(index=False)
-                        st.download_button("Download output.csv", csv, "output.csv", "text/csv")
                         st.subheader("Class Distribution")
-                        st.bar_chart(result_df['class'].value_counts())
     elif app_mode == "Question Answering":
-        st.header("News Article Q&A")
-        uploaded_file = st.file_uploader("Upload CSV for Q&A", type="csv")
-        if uploaded_file is not None:
-            df = pd.read_csv(uploaded_file)
-            if 'content' not in df.columns:
-                st.error("The CSV file must contain a 'content' column.")
-            else:
-                combined_text = " ".join(df['cleaned_content'].dropna().astype(str).tolist())
-                question = st.text_input("Enter your question about the news:")
-                if combined_text and question:
-                    with st.spinner("Loading Q&A model..."):
-                        qa_pipeline = load_qa_pipeline()
-                    if st.button("Get Answer"):
-                        with st.spinner("Finding answer..."):
-                            result = qa_pipeline(question=question, context=combined_text)
-                            st.subheader("Answer")
-                            st.write(result["answer"])
-                            st.subheader("Confidence")
-                            st.progress(float(result["score"]))
-                            st.write(f"Confidence Score: {result['score']:.4f}")
 if __name__ == "__main__":
     main()

 from nltk.stem import WordNetLemmatizer
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+import requests
+from io import BytesIO
 # Set page configuration
+st.set_page_config(page_title="News Classifier", page_icon="📰")
 # Download required NLTK resources
 @st.cache_resource
 # Module-level English stop-word set and WordNet lemmatizer, shared by
 # preprocess_text() so they are not rebuilt for every article.
 stop_words = set(stopwords.words('english'))
 lemmatizer = WordNetLemmatizer()
+# Load the fine-tuned model for classification
 @st.cache_resource
 def load_classification_model():
     """Load the news classifier and its tokenizer.

     Both are loaded with ``from_pretrained`` from the same checkpoint
     identifier. The function is wrapped in ``st.cache_resource``.

     Returns:
         A ``(model, tokenizer)`` tuple, in that order.
     """
     # Replace with your actual model path
     checkpoint = "Oneli/News_Classification"
     # Tokenizer first, then the model, matching the original load order.
     news_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     news_classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
     return news_classifier, news_tokenizer
     qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
     return qa_pipeline
+# Text preprocessing function
 def preprocess_text(text):
     """Clean one article so it can be fed to the classifier.

     Values that ``pd.isna`` reports as missing become an empty string.
     Otherwise the text is lower-cased, then URLs, HTML tags and every
     character that is not an ASCII letter or whitespace are removed.
     The remainder is tokenized, stop words are dropped and the surviving
     tokens are lemmatized.

     Args:
         text: Raw article text, or a missing value.

     Returns:
         The cleaned tokens joined by single spaces.
     """
     if pd.isna(text):
         return ""
     stripped = text.lower()
     # Order matters: URLs and tags are removed before the catch-all pattern
     # strips the punctuation that delimits them.
     removal_patterns = (
         r'http\S+|www\S+|https\S+',  # URLs
         r'<.*?>',                    # HTML tags
         r'[^a-zA-Z\s]',              # special characters and numbers
     )
     for pattern in removal_patterns:
         stripped = re.sub(pattern, '', stripped)
     kept = []
     for token in word_tokenize(stripped):
         if token in stop_words:
             continue
         kept.append(lemmatizer.lemmatize(token))
     return ' '.join(kept)
+# Function to classify news articles with batch processing
 def classify_news(df, model, tokenizer):
+    # Preprocess the text
     df['cleaned_content'] = df['content'].apply(preprocess_text)
+    # Prepare for classification
     texts = df['cleaned_content'].tolist()
+    # Get predictions
     predictions = []
     batch_size = 16
             batch_predictions = torch.argmax(logits, dim=1).tolist()
             predictions.extend(batch_predictions)
+    # Map numeric predictions back to class labels
     id2label = model.config.id2label
     df['class'] = [id2label[pred] for pred in predictions]
     return df
 # Main app
 def _classify_text(text, model, tokenizer):
     """Classify a single article and report the model's confidence.

     The text goes through preprocess_text() first, the same cleaning that
     classify_news() applies to the 'content' column of an uploaded CSV.

     Args:
         text: Raw article text entered by the user.
         model: Sequence-classification model returned by
             load_classification_model().
         tokenizer: Tokenizer returned by load_classification_model().

     Returns:
         A ``(label, confidence)`` tuple. ``label`` is looked up in
         ``model.config.id2label``; ``confidence`` is the softmax
         probability of that label as a percentage rounded to two decimals.
     """
     cleaned = preprocess_text(text)
     inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True)
     # Inference only: no gradients are needed.
     with torch.no_grad():
         logits = model(**inputs).logits
     probabilities = torch.softmax(logits, dim=1)[0]
     predicted = int(torch.argmax(probabilities))
     confidence = round(float(probabilities[predicted]) * 100, 2)
     return model.config.id2label[predicted], confidence


 def main():
     """Render the Streamlit UI.

     The sidebar switches between two modes:

     * "News Classification": classify one pasted article, or every row of
       an uploaded CSV that has a 'content' column. Bulk results can be
       downloaded as ``output.csv`` and are summarised in a bar chart.
     * "Question Answering": answer a question about a pasted article with
       the question-answering pipeline.
     """
     st.title("News Classifier 📢")
     # Sidebar for navigation
     st.sidebar.title("Navigation")
     app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])
     # Section for Single Article Classification
     if app_mode == "News Classification":
         st.header("📰 Single Article Classification")
         st.write("Enter a news article or upload a CSV file to classify the content.")
         # Text input for single article classification
         text_input = st.text_area("Enter News Text", placeholder="Type or paste news content here...")
         if st.button("🔍 Classify"):
             # Whitespace-only input counts as empty.
             if text_input and text_input.strip():
                 # Load classification model
                 with st.spinner("Loading classification model..."):
                     model, tokenizer = load_classification_model()
                 # Classify the text. The original called classify_text(),
                 # which is not defined in the visible file, so the helper
                 # above is used instead.
                 with st.spinner("Classifying the article..."):
                     category, confidence = _classify_text(text_input, model, tokenizer)
                     st.write(f"*Predicted Category:* {category}")
                     st.write(f"*Confidence Level:* {confidence}%")
             else:
                 st.warning("Please enter some text to classify.")
         # File upload for bulk classification
         st.subheader("📂 Bulk Classification (CSV)")
         file_input = st.file_uploader("Upload CSV File", type="csv")
         if file_input:
             df = pd.read_csv(file_input)
             # Display sample of the data
             st.subheader("Sample of uploaded data")
             st.dataframe(df.head())
             # Check if the required column exists
             if 'content' not in df.columns:
                 st.error("The CSV file must contain a 'content' column with the news articles text.")
             else:
                 # Load model and tokenizer
                 with st.spinner("Loading classification model..."):
                     model, tokenizer = load_classification_model()
                 # Classify button
                 if st.button("Classify Articles"):
                     with st.spinner("Classifying news articles..."):
                         # Perform classification
                         result_df = classify_news(df, model, tokenizer)
                         # Display results
                         st.subheader("Classification Results")
                         st.dataframe(result_df[['content', 'class']])
                         # Save to CSV
                         csv = result_df.to_csv(index=False)
                         st.download_button(
                             label="Download output.csv",
                             data=csv,
                             file_name="output.csv",
                             mime="text/csv"
                         )
                         # Show distribution of classes
                         st.subheader("Class Distribution")
                         class_counts = result_df['class'].value_counts()
                         st.bar_chart(class_counts)
     # Section for Question Answering
     elif app_mode == "Question Answering":
         st.header("💬 AI Chat Assistant")
         st.write("Ask questions about news content and get answers using a Q&A model.")
         # Text area for news content
         news_content = st.text_area("Paste news article content here:", height=200)
         # Question input
         question = st.text_input("Enter your question about the article:")
         if news_content and question:
             # Load QA pipeline
             with st.spinner("Loading Q&A model..."):
                 qa_pipeline = load_qa_pipeline()
             # Get answer
             if st.button("Get Answer"):
                 with st.spinner("Finding answer..."):
                     result = qa_pipeline(question=question, context=news_content)
                     # Display results
                     st.subheader("Answer")
                     st.write(result["answer"])
                     st.subheader("Confidence")
                     st.progress(float(result["score"]))
                     st.write(f"Confidence Score: {result['score']:.4f}")
 # Entry point: build the UI only when this module is run directly.
 if __name__ == "__main__":
     main()