Sandini committed on
Commit
1651757
Β·
verified Β·
1 Parent(s): 75f7244

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -1,11 +1,13 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
- from sentence_transformers import CrossEncoder
5
- import nltk
 
6
  from nltk.tokenize import word_tokenize
7
  from nltk.corpus import stopwords
8
  from nltk.stem import WordNetLemmatizer
 
9
 
10
  # Download NLTK resources (run this once if not already downloaded)
11
  nltk.download('punkt')
@@ -97,14 +99,11 @@ st.markdown("<div class='custom-header'> 🧩 AI-Powered News Analyzer</div>", u
97
  classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
98
  qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
99
 
100
- # Initialize Cross-Encoder for QA relevance scoring
101
- cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # Pre-trained Cross-Encoder model
102
 
103
  # Define preprocessing functions for classification
104
  def preprocess_text(text):
105
- if not isinstance(text, str):
106
- text = ""
107
-
108
  # Step 1: Lowercase the text
109
  text = text.lower()
110
 
@@ -162,7 +161,6 @@ with col1:
162
 
163
  # Preprocess the content column and predict categories
164
  if 'content' in df.columns:
165
- df['content'] = df['content'].fillna("").astype(str)
166
  df['preprocessed_content'] = df['content'].apply(preprocess_text)
167
  df['class'] = df['preprocessed_content'].apply(predict_category)
168
 
@@ -182,6 +180,7 @@ with col1:
182
  st.markdown("<div class='csv-box'><h4>πŸ“œ CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
183
  st.dataframe(df_for_display, use_container_width=True)
184
 
 
185
  # Right Section - Q&A Interface
186
  with col2:
187
  st.subheader("πŸ€– AI Assistant")
@@ -201,14 +200,15 @@ with col2:
201
  if 'content' in df.columns:
202
  context = df['content'].dropna().tolist() # Use the content column as context
203
 
204
- # Prepare pairs of (question, context)
205
- pairs = [(user_question, c) for c in context]
 
206
 
207
- # Score each pair using the Cross-Encoder
208
- scores = cross_encoder.predict(pairs)
 
209
 
210
- # Get top matches based on scores
211
- top_indices = scores.argsort()[-5:][::-1] # Get indices of top 5 matches
212
  top_context = "\n".join([context[i] for i in top_indices])
213
 
214
  # Get answer from Hugging Face model using top context
 
1
  import streamlit as st
2
  import pandas as pd
3
  from transformers import pipeline
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from sentence_transformers import SentenceTransformer
6
+ import string
7
  from nltk.tokenize import word_tokenize
8
  from nltk.corpus import stopwords
9
  from nltk.stem import WordNetLemmatizer
10
+ import nltk
11
 
12
  # Download NLTK resources (run this once if not already downloaded)
13
  nltk.download('punkt')
 
99
  classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
100
  qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
101
 
102
+ # Initialize sentence transformer model for QA similarity
103
+ sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # Pre-trained sentence model
104
 
105
  # Define preprocessing functions for classification
106
  def preprocess_text(text):
 
 
 
107
  # Step 1: Lowercase the text
108
  text = text.lower()
109
 
 
161
 
162
  # Preprocess the content column and predict categories
163
  if 'content' in df.columns:
 
164
  df['preprocessed_content'] = df['content'].apply(preprocess_text)
165
  df['class'] = df['preprocessed_content'].apply(predict_category)
166
 
 
180
  st.markdown("<div class='csv-box'><h4>πŸ“œ CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
181
  st.dataframe(df_for_display, use_container_width=True)
182
 
183
+
184
  # Right Section - Q&A Interface
185
  with col2:
186
  st.subheader("πŸ€– AI Assistant")
 
200
  if 'content' in df.columns:
201
  context = df['content'].dropna().tolist() # Use the content column as context
202
 
203
+ # Generate embeddings for the context and the question
204
+ context_embeddings = sentence_model.encode(context)
205
+ question_embedding = sentence_model.encode([user_question])
206
 
207
+ # Calculate cosine similarity
208
+ similarities = cosine_similarity(question_embedding, context_embeddings)
209
+ top_indices = similarities[0].argsort()[-5:][::-1] # Get top 5 similar rows
210
 
211
+ # Prepare the top 5 similar context rows
 
212
  top_context = "\n".join([context[i] for i in top_indices])
213
 
214
  # Get answer from Hugging Face model using top context