Update app.py
app.py
CHANGED
@@ -5,7 +5,7 @@ from transformers import pipeline
 import faiss
 import numpy as np
 
-# ---------- Custom CSS ----------
+# ---------- Custom CSS for UI ----------
 def apply_custom_style():
     st.markdown("""
     <style>
@@ -48,9 +48,11 @@ def load_pdf_text(pdf_path):
     reader = PdfReader(pdf_path)
     text = ''
     for page in reader.pages:
-        text += page.extract_text()
+        if page.extract_text():
+            text += page.extract_text()
     return text
 
+# ---------- Chunking ----------
 def chunk_text(text, max_len=500):
     sentences = text.split('. ')
     chunks, chunk = [], ''
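The added check skips pages where page.extract_text() comes back empty (for example scanned, image-only pages), though it runs the extraction twice per page. A minimal single-pass variant is sketched below; it is not part of this commit, and the pypdf import is an assumption (app.py may import PdfReader from PyPDF2 instead).

from pypdf import PdfReader  # assumption: app.py may import PdfReader from PyPDF2 instead

def load_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = ''
    for page in reader.pages:
        page_text = page.extract_text()  # extract once; may be '' on image-only pages
        if page_text:
            text += page_text
    return text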
@@ -60,37 +62,46 @@ def chunk_text(text, max_len=500):
         else:
             chunks.append(chunk.strip())
             chunk = sentence + '. '
-
+    if chunk:
+        chunks.append(chunk.strip())
     return chunks
 
+# ---------- Embedding ----------
 @st.cache_resource
 def embed_chunks(chunks):
     model = SentenceTransformer('all-MiniLM-L6-v2')
     embeddings = model.encode(chunks)
     return embeddings, model
 
+# ---------- RAG-Based QA ----------
 def answer_query(query, embeddings, chunks, model, qa_pipeline):
     query_embedding = model.encode([query])
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
-    _, I = index.search(np.array(query_embedding), k=
-    context = "\n".join([chunks[i] for i in I[0]])
+    _, I = index.search(np.array(query_embedding), k=5)  # retrieve top 5 chunks
+    context = "\n\n".join([chunks[i] for i in I[0]])  # longer, better context
     result = qa_pipeline(question=query, context=context)
     return result['answer']
 
 # ---------- App Layout ----------
 apply_custom_style()
 st.markdown('<div class="title">🤖 RAG PDF Q&A App</div>', unsafe_allow_html=True)
-st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers!</div>', unsafe_allow_html=True)
+st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers and FAISS!</div>', unsafe_allow_html=True)
 
-# Load
+# ---------- Load PDF ----------
 pdf_path = "ml_dataset_25_pages.pdf"
 raw_text = load_pdf_text(pdf_path)
 chunks = chunk_text(raw_text)
 embeddings, embedder = embed_chunks(chunks)
-qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 
-#
+# ---------- QA Pipeline ----------
+qa = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",
+    tokenizer="deepset/roberta-base-squad2"
+)
+
+# ---------- Sample Questions ----------
 st.markdown('<div class="question-box"><strong>💡 Sample Questions:</strong>', unsafe_allow_html=True)
 sample_questions = [
     "What is supervised learning?",
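For reference, a standalone sketch of the retrieval-then-read flow that answer_query now implements: top-k L2 search over MiniLM embeddings, with the retrieved chunks joined into one context for the deepset/roberta-base-squad2 reader. The chunk strings and the query below are made-up placeholders, not content from ml_dataset_25_pages.pdf.

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Placeholder chunks standing in for the PDF chunks
chunks = [
    "Supervised learning trains a model on labelled examples.",
    "Unsupervised learning finds structure in unlabelled data.",
    "Overfitting means a model memorises noise in the training data.",
]

embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(chunks)              # (n_chunks, 384) float32 array

index = faiss.IndexFlatL2(embeddings.shape[1])    # exact L2 search, as in the app
index.add(np.array(embeddings))

query = "What is supervised learning?"
_, I = index.search(np.array(embedder.encode([query])), k=2)   # the app uses k=5
context = "\n\n".join(chunks[i] for i in I[0])

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
print(qa(question=query, context=context)['answer'])

Since roberta-base-squad2 is an extractive reader, the answer is always a span copied out of the retrieved context, which is why the commit enlarges that context (k=5 and a blank-line join between chunks).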
@@ -103,8 +114,8 @@ for q in sample_questions:
     st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
 st.markdown('</div>', unsafe_allow_html=True)
 
-# User
-query = st.text_input("🔍 Ask your question:")
+# ---------- User Query ----------
+query = st.text_input("🔍 Ask your question here:")
 if query:
     with st.spinner("Thinking..."):
         answer = answer_query(query, embeddings, chunks, embedder, qa)