Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
from transformers import pipeline
|
4 |
-
from
|
5 |
-
import
|
|
|
6 |
from nltk.tokenize import word_tokenize
|
7 |
from nltk.corpus import stopwords
|
8 |
from nltk.stem import WordNetLemmatizer
|
|
|
9 |
|
10 |
# Download NLTK resources (run this once if not already downloaded)
|
11 |
nltk.download('punkt')
|
@@ -97,14 +99,11 @@ st.markdown("<div class='custom-header'> 🧩 AI-Powered News Analyzer</div>", u
|
|
97 |
classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
|
98 |
qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
|
99 |
|
100 |
-
# Initialize
|
101 |
-
|
102 |
|
103 |
# Define preprocessing functions for classification
|
104 |
def preprocess_text(text):
|
105 |
-
if not isinstance(text, str):
|
106 |
-
text = ""
|
107 |
-
|
108 |
# Step 1: Lowercase the text
|
109 |
text = text.lower()
|
110 |
|
@@ -162,7 +161,6 @@ with col1:
|
|
162 |
|
163 |
# Preprocess the content column and predict categories
|
164 |
if 'content' in df.columns:
|
165 |
-
df['content'] = df['content'].fillna("").astype(str)
|
166 |
df['preprocessed_content'] = df['content'].apply(preprocess_text)
|
167 |
df['class'] = df['preprocessed_content'].apply(predict_category)
|
168 |
|
@@ -182,6 +180,7 @@ with col1:
|
|
182 |
st.markdown("<div class='csv-box'><h4>π CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
|
183 |
st.dataframe(df_for_display, use_container_width=True)
|
184 |
|
|
|
185 |
# Right Section - Q&A Interface
|
186 |
with col2:
|
187 |
st.subheader("π€ AI Assistant")
|
@@ -201,14 +200,15 @@ with col2:
|
|
201 |
if 'content' in df.columns:
|
202 |
context = df['content'].dropna().tolist() # Use the content column as context
|
203 |
|
204 |
-
#
|
205 |
-
|
|
|
206 |
|
207 |
-
#
|
208 |
-
|
|
|
209 |
|
210 |
-
#
|
211 |
-
top_indices = scores.argsort()[-5:][::-1] # Get indices of top 5 matches
|
212 |
top_context = "\n".join([context[i] for i in top_indices])
|
213 |
|
214 |
# Get answer from Hugging Face model using top context
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
from transformers import pipeline
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
import string
|
7 |
from nltk.tokenize import word_tokenize
|
8 |
from nltk.corpus import stopwords
|
9 |
from nltk.stem import WordNetLemmatizer
|
10 |
+
import nltk
|
11 |
|
12 |
# Download NLTK resources (run this once if not already downloaded)
|
13 |
nltk.download('punkt')
|
|
|
99 |
classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
|
100 |
qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
|
101 |
|
102 |
+
# Initialize sentence transformer model for QA similarity
|
103 |
+
sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # Pre-trained sentence model
|
104 |
|
105 |
# Define preprocessing functions for classification
|
106 |
def preprocess_text(text):
|
|
|
|
|
|
|
107 |
# Step 1: Lowercase the text
|
108 |
text = text.lower()
|
109 |
|
|
|
161 |
|
162 |
# Preprocess the content column and predict categories
|
163 |
if 'content' in df.columns:
|
|
|
164 |
df['preprocessed_content'] = df['content'].apply(preprocess_text)
|
165 |
df['class'] = df['preprocessed_content'].apply(predict_category)
|
166 |
|
|
|
180 |
st.markdown("<div class='csv-box'><h4>π CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
|
181 |
st.dataframe(df_for_display, use_container_width=True)
|
182 |
|
183 |
+
|
184 |
# Right Section - Q&A Interface
|
185 |
with col2:
|
186 |
st.subheader("π€ AI Assistant")
|
|
|
200 |
if 'content' in df.columns:
|
201 |
context = df['content'].dropna().tolist() # Use the content column as context
|
202 |
|
203 |
+
# Generate embeddings for the context and the question
|
204 |
+
context_embeddings = sentence_model.encode(context)
|
205 |
+
question_embedding = sentence_model.encode([user_question])
|
206 |
|
207 |
+
# Calculate cosine similarity
|
208 |
+
similarities = cosine_similarity(question_embedding, context_embeddings)
|
209 |
+
top_indices = similarities[0].argsort()[-5:][::-1] # Get top 5 similar rows
|
210 |
|
211 |
+
# Prepare the top 5 similar context rows
|
|
|
212 |
top_context = "\n".join([context[i] for i in top_indices])
|
213 |
|
214 |
# Get answer from Hugging Face model using top context
|