rahideer committed on
Commit 743388b · verified · 1 Parent(s): d28742c

Update app.py

Files changed (1)
  1. app.py +37 -71
app.py CHANGED
@@ -1,98 +1,64 @@
  import streamlit as st
  import PyPDF2
  from sentence_transformers import SentenceTransformer
  import faiss
  import numpy as np
  from transformers import pipeline

- st.set_page_config(page_title="📘 PDF QA RAG App", layout="wide")

  # Custom styles
  st.markdown("""
  <style>
  .main {background-color: #f7faff;}
- .block-container {padding-top: 2rem;}
- h1 {color: #4051b5;}
  .stTextInput>div>div>input {border: 2px solid #d0d7ff;}
- .stButton button {background-color: #4051b5; color: white; border-radius: 6px;}
- .stSidebar {background-color: #eaf0ff;}
- .sample-dropdown label {font-weight: bold;}
  </style>
  """, unsafe_allow_html=True)

- st.title("📘 Ask Me Anything From Your PDF")
- st.caption("Built using RAG (Retrieval-Augmented Generation) ✨")

- st.sidebar.header("📁 Upload PDF")
- uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])

- default_questions = [
-     "What is machine learning?",
-     "Explain generalization in ML.",
-     "What are different types of ML?",
-     "How is ML used in computer vision?",
-     "Describe the importance of training data."
- ]
-
- @st.cache_data
- def load_pdf(file):
-     reader = PyPDF2.PdfReader(file)
-     return [page.extract_text() for page in reader.pages]

  def chunk_text(pages, max_len=1000):
      text = " ".join(pages)
      words = text.split()
      return [' '.join(words[i:i+max_len]) for i in range(0, len(words), max_len)]

- def create_faiss_index(chunks, model):
      embeddings = model.encode(chunks)
      index = faiss.IndexFlatL2(embeddings.shape[1])
      index.add(np.array(embeddings))
-     return index, embeddings
-
- def retrieve_context(question, chunks, index, model, k=6):
-     q_embedding = model.encode([question])
-     _, I = index.search(np.array(q_embedding), k)
-     return "\n\n".join([chunks[i] for i in I[0]])
-
- if uploaded_file:
-     st.success("✅ PDF uploaded successfully!")
-
-     pages = load_pdf(uploaded_file)
-     chunks = chunk_text(pages)
-     model = SentenceTransformer('all-MiniLM-L6-v2')
-     index, _ = create_faiss_index(chunks, model)
-     qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
-
-     st.subheader("💬 Ask a question")
-
-     col1, col2 = st.columns([3, 1])
-     with col1:
-         question = st.text_input("Enter your question here...", placeholder="e.g. What is deep learning?")
-     with col2:
-         if st.button("Ask"):
-             with st.spinner("🧠 Thinking..."):
-                 context = retrieve_context(question, chunks, index, model)
-                 result = qa_pipeline(question=question, context=context)
-             with st.expander("📖 Answer", expanded=True):
-                 st.markdown(result['answer'])
-
-     st.divider()
-     st.subheader("✨ Sample Questions")
-     selected_q = st.selectbox("Pick one to try:", default_questions, key="sample-dropdown")
-     if st.button("Try Selected Question"):
-         with st.spinner("⏳ Searching..."):
-             context = retrieve_context(selected_q, chunks, index, model)
-             result = qa_pipeline(question=selected_q, context=context)
-         with st.expander(f"💡 Answer to: '{selected_q}'", expanded=True):
-             st.markdown(result['answer'])
-
-     st.divider()
-     st.subheader("📄 Preview PDF Pages")
-     for i, page in enumerate(pages[:3]):
-         st.markdown(f"**Page {i+1}**")
-         st.code(page[:800] + "..." if len(page) > 800 else page)
-
- else:
-     st.info("Upload a PDF from the sidebar to begin.")
-

  import streamlit as st
  import PyPDF2
+ import os
  from sentence_transformers import SentenceTransformer
  import faiss
  import numpy as np
  from transformers import pipeline

+ st.set_page_config(page_title="📘 PDF RAG QA", layout="wide")

  # Custom styles
  st.markdown("""
  <style>
  .main {background-color: #f7faff;}
+ h1 {color: #4a4a8a;}
  .stTextInput>div>div>input {border: 2px solid #d0d7ff;}
+ .stButton button {background-color: #4a4a8a; color: white;}
  </style>
  """, unsafe_allow_html=True)

+ st.title("📘 Ask Me Anything About Machine Learning")
+ st.caption("Using RAG (Retrieval-Augmented Generation) and a preloaded PDF")

+ # Load PDF from local file
+ PDF_FILE = "data.pdf"

+ def load_pdf(file_path):
+     with open(file_path, "rb") as f:
+         reader = PyPDF2.PdfReader(f)
+         return [page.extract_text() for page in reader.pages]

  def chunk_text(pages, max_len=1000):
      text = " ".join(pages)
      words = text.split()
      return [' '.join(words[i:i+max_len]) for i in range(0, len(words), max_len)]

+ @st.cache_resource
+ def setup_rag():
+     pages = load_pdf(PDF_FILE)
+     chunks = chunk_text(pages)
+     model = SentenceTransformer('all-MiniLM-L6-v2')
      embeddings = model.encode(chunks)
      index = faiss.IndexFlatL2(embeddings.shape[1])
      index.add(np.array(embeddings))
+     qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
+     return chunks, model, index, qa
+
+ def retrieve_answer(question, chunks, model, index, qa_pipeline, k=6):
+     q_embed = model.encode([question])
+     _, I = index.search(np.array(q_embed), k)
+     context = "\n\n".join([chunks[i] for i in I[0]])
+     result = qa_pipeline(question=question, context=context)
+     return result['answer']
+
+ chunks, embed_model, faiss_index, qa_model = setup_rag()
+
+ st.subheader("💬 Ask a Question")
+ question = st.text_input("Enter your question:", placeholder="e.g., What is supervised learning?")
+
+ if question:
+     with st.spinner("🧠 Searching for the answer..."):
+         answer = retrieve_answer(question, chunks, embed_model, faiss_index, qa_model)
+     st.markdown("#### 📖 Answer:")
+     st.write(answer)
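
Taken together, the new setup_rag / retrieve_answer pair builds the embeddings, FAISS index and QA pipeline once (cached with st.cache_resource) and then answers each question by embedding it, pulling the k nearest chunks from the index, and running extractive QA over that context. A minimal standalone sketch of that flow follows, assuming the same models and libraries as the commit; the chunk strings are illustrative placeholders, not content from the app's data.pdf.

# Standalone sketch of the retrieve-then-answer flow (illustrative only, not part of this commit).
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np

# Placeholder chunks; the app derives these from its PDF via chunk_text().
chunks = [
    "Supervised learning maps labelled inputs to known outputs.",
    "Unsupervised learning finds structure in unlabelled data.",
    "Generalization is a model's performance on unseen examples.",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)               # one 384-d vector per chunk
index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 nearest-neighbour index
index.add(np.array(embeddings))

question = "What is generalization?"
q_embed = model.encode([question])
_, I = index.search(np.array(q_embed), 2)       # indices of the 2 closest chunks
context = "\n\n".join(chunks[i] for i in I[0])

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
print(qa(question=question, context=context)["answer"])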