Sandini committed on
Commit
1651757
Β·
verified Β·
1 Parent(s): 75f7244

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
- from sentence_transformers import CrossEncoder
5
- import nltk
 
6
  from nltk.tokenize import word_tokenize
7
  from nltk.corpus import stopwords
8
  from nltk.stem import WordNetLemmatizer
 
9
 
10
  # Download NLTK resources (run this once if not already downloaded)
11
  nltk.download('punkt')
@@ -97,14 +99,11 @@ st.markdown("<div class='custom-header'> 🧩 AI-Powered News Analyzer</div>", u
97
  classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
98
  qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
99
 
100
- # Initialize Cross-Encoder for QA relevance scoring
101
- cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # Pre-trained Cross-Encoder model
102
 
103
  # Define preprocessing functions for classification
104
  def preprocess_text(text):
105
- if not isinstance(text, str):
106
- text = ""
107
-
108
  # Step 1: Lowercase the text
109
  text = text.lower()
110
 
@@ -162,7 +161,6 @@ with col1:
162
 
163
  # Preprocess the content column and predict categories
164
  if 'content' in df.columns:
165
- df['content'] = df['content'].fillna("").astype(str)
166
  df['preprocessed_content'] = df['content'].apply(preprocess_text)
167
  df['class'] = df['preprocessed_content'].apply(predict_category)
168
 
@@ -182,6 +180,7 @@ with col1:
182
  st.markdown("<div class='csv-box'><h4>πŸ“œ CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
183
  st.dataframe(df_for_display, use_container_width=True)
184
 
 
185
  # Right Section - Q&A Interface
186
  with col2:
187
  st.subheader("πŸ€– AI Assistant")
@@ -201,14 +200,15 @@ with col2:
201
  if 'content' in df.columns:
202
  context = df['content'].dropna().tolist() # Use the content column as context
203
 
204
- # Prepare pairs of (question, context)
205
- pairs = [(user_question, c) for c in context]
 
206
 
207
- # Score each pair using the Cross-Encoder
208
- scores = cross_encoder.predict(pairs)
 
209
 
210
- # Get top matches based on scores
211
- top_indices = scores.argsort()[-5:][::-1] # Get indices of top 5 matches
212
  top_context = "\n".join([context[i] for i in top_indices])
213
 
214
  # Get answer from Hugging Face model using top context
 
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from sentence_transformers import SentenceTransformer
6
+ import string
7
  from nltk.tokenize import word_tokenize
8
  from nltk.corpus import stopwords
9
  from nltk.stem import WordNetLemmatizer
10
+ import nltk
11
 
12
  # Download NLTK resources (run this once if not already downloaded)
13
  nltk.download('punkt')
 
99
  classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
100
  qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
101
 
102
+ # Initialize sentence transformer model for QA similarity
103
+ sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # Pre-trained sentence model
104
 
105
  # Define preprocessing functions for classification
106
  def preprocess_text(text):
 
 
 
107
  # Step 1: Lowercase the text
108
  text = text.lower()
109
 
 
161
 
162
  # Preprocess the content column and predict categories
163
  if 'content' in df.columns:
 
164
  df['preprocessed_content'] = df['content'].apply(preprocess_text)
165
  df['class'] = df['preprocessed_content'].apply(predict_category)
166
 
 
180
  st.markdown("<div class='csv-box'><h4>πŸ“œ CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
181
  st.dataframe(df_for_display, use_container_width=True)
182
 
183
+
184
  # Right Section - Q&A Interface
185
  with col2:
186
  st.subheader("πŸ€– AI Assistant")
 
200
  if 'content' in df.columns:
201
  context = df['content'].dropna().tolist() # Use the content column as context
202
 
203
+ # Generate embeddings for the context and the question
204
+ context_embeddings = sentence_model.encode(context)
205
+ question_embedding = sentence_model.encode([user_question])
206
 
207
+ # Calculate cosine similarity
208
+ similarities = cosine_similarity(question_embedding, context_embeddings)
209
+ top_indices = similarities[0].argsort()[-5:][::-1] # Get top 5 similar rows
210
 
211
+ # Prepare the top 5 similar context rows
 
212
  top_context = "\n".join([context[i] for i in top_indices])
213
 
214
  # Get answer from Hugging Face model using top context