Update app.py
app.py
CHANGED
@@ -5,7 +5,45 @@ from transformers import pipeline
 import faiss
 import numpy as np

-#
+# ---------- Custom CSS ----------
+def apply_custom_style():
+    st.markdown("""
+    <style>
+    html, body, [class*="css"] {
+        font-family: 'Segoe UI', sans-serif;
+        background-color: #f0f4ff;
+    }
+    .title {
+        background: linear-gradient(to right, #4a90e2, #00c6ff);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        font-size: 2.5em;
+        font-weight: bold;
+    }
+    .subtitle {
+        color: #444;
+        font-size: 1.2em;
+        margin-bottom: 1rem;
+    }
+    .question-box {
+        background-color: #fff;
+        padding: 1rem;
+        border-radius: 10px;
+        box-shadow: 0px 2px 10px rgba(0,0,0,0.1);
+        margin-bottom: 1rem;
+    }
+    .example {
+        color: #444;
+        background: #e9f0ff;
+        padding: 0.5rem;
+        border-radius: 8px;
+        margin: 3px 0;
+        cursor: pointer;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+
+# ---------- PDF Reading ----------
 def load_pdf_text(pdf_path):
     reader = PdfReader(pdf_path)
     text = ''
@@ -13,7 +51,6 @@ def load_pdf_text(pdf_path):
         text += page.extract_text()
     return text

-# Split text into chunks
 def chunk_text(text, max_len=500):
     sentences = text.split('. ')
     chunks, chunk = [], ''
@@ -26,14 +63,12 @@ def chunk_text(text, max_len=500):
     chunks.append(chunk.strip())
     return chunks

-# Embed text using SentenceTransformer
 @st.cache_resource
 def embed_chunks(chunks):
     model = SentenceTransformer('all-MiniLM-L6-v2')
     embeddings = model.encode(chunks)
     return embeddings, model

-# RAG QA using FAISS index and QA pipeline
 def answer_query(query, embeddings, chunks, model, qa_pipeline):
     query_embedding = model.encode([query])
     index = faiss.IndexFlatL2(embeddings.shape[1])
@@ -43,31 +78,34 @@ def answer_query(query, embeddings, chunks, model, qa_pipeline):
     result = qa_pipeline(question=query, context=context)
     return result['answer']

-#
-
-st.markdown("
+# ---------- App Layout ----------
+apply_custom_style()
+st.markdown('<div class="title">🤖 RAG PDF Q&A App</div>', unsafe_allow_html=True)
+st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers!</div>', unsafe_allow_html=True)

-# Load and process
+# Load and process PDF
 pdf_path = "ml_dataset_25_pages.pdf"
 raw_text = load_pdf_text(pdf_path)
 chunks = chunk_text(raw_text)
 embeddings, embedder = embed_chunks(chunks)
 qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

-#
-st.
-
-
-
-
-
-
-
-
-
-
-query = st.text_input("Enter your question below:")
+# Sample questions
+st.markdown('<div class="question-box"><strong>💡 Sample Questions:</strong>', unsafe_allow_html=True)
+sample_questions = [
+    "What is supervised learning?",
+    "Explain the difference between regression and classification.",
+    "What are the applications of machine learning?",
+    "How does decision tree algorithm work?",
+    "What is overfitting in machine learning?"
+]
+for q in sample_questions:
+    st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
+st.markdown('</div>', unsafe_allow_html=True)

+# User Input
+query = st.text_input("🔍 Ask your question:")
 if query:
-
-
+    with st.spinner("Thinking..."):
+        answer = answer_query(query, embeddings, chunks, embedder, qa)
+    st.success(f"🧠 Answer: {answer}")
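The hunks above skip new lines 75-77 of answer_query, the step that selects the nearest chunks and builds the context string passed to the QA pipeline. A minimal sketch of what that elided step presumably does, using only the FAISS calls already visible in the diff (the helper name retrieve_context, the k value, and the float32 casts are illustrative assumptions, not taken from the commit):

import faiss
import numpy as np

def retrieve_context(query_embedding, embeddings, chunks, k=3):
    # Flat L2 index over the chunk embeddings, as in answer_query.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.asarray(embeddings, dtype="float32"))
    # Find the k chunks closest to the query embedding.
    _, ids = index.search(np.asarray(query_embedding, dtype="float32"), k)
    # Concatenate the retrieved chunks into one context string for the QA pipeline.
    return " ".join(chunks[i] for i in ids[0])

Note that the committed answer_query rebuilds the FAISS index on every question; only embed_chunks is cached with @st.cache_resource. With the imports above the shown hunks in place (streamlit, PdfReader, SentenceTransformer, and the transformers pipeline) and ml_dataset_25_pages.pdf alongside the script, the app is started with streamlit run app.py.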