rahideer committed on
Commit
65b8f99
·
verified ·
1 Parent(s): c0a9dbe

Update app.py

Files changed (1)
  1. app.py +22 -11
app.py CHANGED
@@ -5,7 +5,7 @@ from transformers import pipeline
 import faiss
 import numpy as np
 
-# ---------- Custom CSS ----------
+# ---------- Custom CSS for UI ----------
 def apply_custom_style():
     st.markdown("""
     <style>
@@ -48,9 +48,11 @@ def load_pdf_text(pdf_path):
     reader = PdfReader(pdf_path)
     text = ''
     for page in reader.pages:
-        text += page.extract_text()
+        if page.extract_text():
+            text += page.extract_text()
     return text
 
+# ---------- Chunking ----------
 def chunk_text(text, max_len=500):
     sentences = text.split('. ')
     chunks, chunk = [], ''
@@ -60,37 +62,46 @@ def chunk_text(text, max_len=500):
         else:
             chunks.append(chunk.strip())
             chunk = sentence + '. '
-    chunks.append(chunk.strip())
+    if chunk:
+        chunks.append(chunk.strip())
     return chunks
 
+# ---------- Embedding ----------
 @st.cache_resource
 def embed_chunks(chunks):
     model = SentenceTransformer('all-MiniLM-L6-v2')
     embeddings = model.encode(chunks)
     return embeddings, model
 
+# ---------- RAG-Based QA ----------
 def answer_query(query, embeddings, chunks, model, qa_pipeline):
     query_embedding = model.encode([query])
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(np.array(embeddings))
-    _, I = index.search(np.array(query_embedding), k=3)
-    context = "\n".join([chunks[i] for i in I[0]])
+    _, I = index.search(np.array(query_embedding), k=5)  # retrieve top 5 chunks
+    context = "\n\n".join([chunks[i] for i in I[0]])  # longer, better context
     result = qa_pipeline(question=query, context=context)
     return result['answer']
 
 # ---------- App Layout ----------
 apply_custom_style()
 st.markdown('<div class="title">🤖 RAG PDF Q&A App</div>', unsafe_allow_html=True)
-st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers!</div>', unsafe_allow_html=True)
+st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers and FAISS!</div>', unsafe_allow_html=True)
 
-# Load and process PDF
+# ---------- Load PDF ----------
 pdf_path = "ml_dataset_25_pages.pdf"
 raw_text = load_pdf_text(pdf_path)
 chunks = chunk_text(raw_text)
 embeddings, embedder = embed_chunks(chunks)
-qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
 
-# Sample questions
+# ---------- QA Pipeline ----------
+qa = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",
+    tokenizer="deepset/roberta-base-squad2"
+)
+
+# ---------- Sample Questions ----------
 st.markdown('<div class="question-box"><strong>💡 Sample Questions:</strong>', unsafe_allow_html=True)
 sample_questions = [
     "What is supervised learning?",
@@ -103,8 +114,8 @@ for q in sample_questions:
     st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
 st.markdown('</div>', unsafe_allow_html=True)
 
-# User Input
-query = st.text_input("🔎 Ask your question:")
+# ---------- User Query ----------
+query = st.text_input("🔎 Ask your question here:")
 if query:
     with st.spinner("Thinking..."):
         answer = answer_query(query, embeddings, chunks, embedder, qa)
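
For reference, below is a minimal, self-contained sketch of the retrieval-plus-QA path this commit switches to: a top-5 FAISS similarity search over MiniLM sentence embeddings whose hits are joined into the context passed to deepset/roberta-base-squad2. The toy docs list, the sample query, and the variable names are illustrative assumptions only; the actual app builds its chunks from ml_dataset_25_pages.pdf inside Streamlit.

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Toy corpus standing in for the PDF chunks (illustrative only).
docs = [
    "Supervised learning trains a model on labelled input-output pairs.",
    "Unsupervised learning finds structure in unlabelled data.",
    "Overfitting means a model memorises noise in the training set.",
]

# Embed the corpus with the same sentence-transformer the app uses.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(docs)  # float32 array, shape (n_docs, 384)

# Exact L2 index, as in answer_query().
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.asarray(embeddings, dtype="float32"))

# Extractive QA model introduced by this commit.
qa = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)

query = "What is supervised learning?"
query_emb = embedder.encode([query])
k = min(5, len(docs))  # top-5 retrieval, capped by the toy corpus size
_, I = index.search(np.asarray(query_emb, dtype="float32"), k)
context = "\n\n".join(docs[i] for i in I[0])  # blank-line-separated context
print(qa(question=query, context=context)["answer"])

Capping k at the corpus size only keeps the sketch from requesting more neighbours than exist; the app itself works over far more than five chunks.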