Update app.py
app.py
CHANGED
@@ -5,7 +5,7 @@ from transformers import pipeline
 import faiss
 import numpy as np
 
-# ---------- Custom CSS ----------
+# ---------- Custom CSS for UI ----------
 def apply_custom_style():
     st.markdown("""
     <style>
@@ -48,9 +48,11 @@ def load_pdf_text(pdf_path):
     reader = PdfReader(pdf_path)
     text = ''
     for page in reader.pages:
-        text += page.extract_text()
+        if page.extract_text():
+            text += page.extract_text()
     return text
 
+# ---------- Chunking ----------
 def chunk_text(text, max_len=500):
     sentences = text.split('. ')
     chunks, chunk = [], ''
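The added check skips pages where page.extract_text() comes back empty (for example scanned, image-only pages), though it runs the extraction twice per page. A minimal single-pass variant is sketched below; it is not part of this commit, and the pypdf import is an assumption (app.py may import PdfReader from PyPDF2 instead).

from pypdf import PdfReader  # assumption: app.py may import PdfReader from PyPDF2 instead

def load_pdf_text(pdf_path):
    reader = PdfReader(pdf_path)
    text = ''
    for page in reader.pages:
        page_text = page.extract_text()  # extract once; may be '' on image-only pages
        if page_text:
            text += page_text
    return text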
@@ -60,37 +62,46 @@ def chunk_text(text, max_len=500):
         else:
             chunks.append(chunk.strip())
             chunk = sentence + '. '
-
+    if chunk:
+        chunks.append(chunk.strip())
     return chunks
 
+# ---------- Embedding ----------
 @st.cache_resource
 def embed_chunks(chunks):
     model = SentenceTransformer('all-MiniLM-L6-v2')
     embeddings = model.encode(chunks)
     return embeddings, model
 
+# ---------- RAG-Based QA ----------
 def answer_query(query, embeddings, chunks, model, qa_pipeline):
     query_embedding = model.encode([query])
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
-    _, I = index.search(np.array(query_embedding), k=
-    context = "\n".join([chunks[i] for i in I[0]])
+    _, I = index.search(np.array(query_embedding), k=5)  # retrieve top 5 chunks
+    context = "\n\n".join([chunks[i] for i in I[0]])  # longer, better context
     result = qa_pipeline(question=query, context=context)
     return result['answer']
 
 # ---------- App Layout ----------
 apply_custom_style()
 st.markdown('<div class="title">🤖 RAG PDF Q&A App</div>', unsafe_allow_html=True)
-st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers!</div>', unsafe_allow_html=True)
+st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers and FAISS!</div>', unsafe_allow_html=True)
 
-# Load
+# ---------- Load PDF ----------
 pdf_path = "ml_dataset_25_pages.pdf"
 raw_text = load_pdf_text(pdf_path)
 chunks = chunk_text(raw_text)
 embeddings, embedder = embed_chunks(chunks)
-qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 
-#
+# ---------- QA Pipeline ----------
+qa = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",
+    tokenizer="deepset/roberta-base-squad2"
+)
+
+# ---------- Sample Questions ----------
 st.markdown('<div class="question-box"><strong>💡 Sample Questions:</strong>', unsafe_allow_html=True)
 sample_questions = [
     "What is supervised learning?",
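For reference, a standalone sketch of the retrieval-then-read flow that answer_query now implements: top-k L2 search over MiniLM embeddings, with the retrieved chunks joined into one context for the deepset/roberta-base-squad2 reader. The chunk strings and the query below are made-up placeholders, not content from ml_dataset_25_pages.pdf.

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Placeholder chunks standing in for the PDF chunks
chunks = [
    "Supervised learning trains a model on labelled examples.",
    "Unsupervised learning finds structure in unlabelled data.",
    "Overfitting means a model memorises noise in the training data.",
]

embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(chunks)              # (n_chunks, 384) float32 array

index = faiss.IndexFlatL2(embeddings.shape[1])    # exact L2 search, as in the app
index.add(np.array(embeddings))

query = "What is supervised learning?"
_, I = index.search(np.array(embedder.encode([query])), k=2)   # the app uses k=5
context = "\n\n".join(chunks[i] for i in I[0])

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
print(qa(question=query, context=context)['answer'])

Since roberta-base-squad2 is an extractive reader, the answer is always a span copied out of the retrieved context, which is why the commit enlarges that context (k=5 and a blank-line join between chunks).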
@@ -103,8 +114,8 @@ for q in sample_questions:
     st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
 st.markdown('</div>', unsafe_allow_html=True)
 
-# User
-query = st.text_input("🔍 Ask your question:")
+# ---------- User Query ----------
+query = st.text_input("🔍 Ask your question here:")
 if query:
     with st.spinner("Thinking..."):
         answer = answer_query(query, embeddings, chunks, embedder, qa)