rahideer commited on
Commit
d28742c
·
verified ·
1 Parent(s): dc5042a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -114
app.py CHANGED
@@ -1,122 +1,98 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfReader
3
  from sentence_transformers import SentenceTransformer
4
- from transformers import pipeline
5
  import faiss
6
  import numpy as np
 
7
 
8
- # ---------- Custom CSS for UI ----------
9
- def apply_custom_style():
10
- st.markdown("""
11
- <style>
12
- html, body, [class*="css"] {
13
- font-family: 'Segoe UI', sans-serif;
14
- background-color: #f0f4ff;
15
- }
16
- .title {
17
- background: linear-gradient(to right, #4a90e2, #00c6ff);
18
- -webkit-background-clip: text;
19
- -webkit-text-fill-color: transparent;
20
- font-size: 2.5em;
21
- font-weight: bold;
22
- }
23
- .subtitle {
24
- color: #444;
25
- font-size: 1.2em;
26
- margin-bottom: 1rem;
27
- }
28
- .question-box {
29
- background-color: #fff;
30
- padding: 1rem;
31
- border-radius: 10px;
32
- box-shadow: 0px 2px 10px rgba(0,0,0,0.1);
33
- margin-bottom: 1rem;
34
- }
35
- .example {
36
- color: #444;
37
- background: #e9f0ff;
38
- padding: 0.5rem;
39
- border-radius: 8px;
40
- margin: 3px 0;
41
- cursor: pointer;
42
- }
43
- </style>
44
- """, unsafe_allow_html=True)
45
-
46
- # ---------- PDF Reading ----------
47
- def load_pdf_text(pdf_path):
48
- reader = PdfReader(pdf_path)
49
- text = ''
50
- for page in reader.pages:
51
- if page.extract_text():
52
- text += page.extract_text()
53
- return text
54
-
55
- # ---------- Chunking ----------
56
- def chunk_text(text, max_len=500):
57
- sentences = text.split('. ')
58
- chunks, chunk = [], ''
59
- for sentence in sentences:
60
- if len(chunk) + len(sentence) <= max_len:
61
- chunk += sentence + '. '
62
- else:
63
- chunks.append(chunk.strip())
64
- chunk = sentence + '. '
65
- if chunk:
66
- chunks.append(chunk.strip())
67
- return chunks
68
-
69
- # ---------- Embedding ----------
70
- @st.cache_resource
71
- def embed_chunks(chunks):
72
- model = SentenceTransformer('all-MiniLM-L6-v2')
73
- embeddings = model.encode(chunks)
74
- return embeddings, model
75
 
76
- # ---------- RAG-Based QA ----------
77
- def answer_query(query, embeddings, chunks, model, qa_pipeline):
78
- query_embedding = model.encode([query])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  index = faiss.IndexFlatL2(embeddings.shape[1])
80
  index.add(np.array(embeddings))
81
- _, I = index.search(np.array(query_embedding), k=5) # retrieve top 5 chunks
82
- context = "\n\n".join([chunks[i] for i in I[0]]) # longer, better context
83
- result = qa_pipeline(question=query, context=context)
84
- return result['answer']
85
-
86
- # ---------- App Layout ----------
87
- apply_custom_style()
88
- st.markdown('<div class="title">πŸ€– RAG PDF Q&A App</div>', unsafe_allow_html=True)
89
- st.markdown('<div class="subtitle">Ask questions about a machine learning PDF. Powered by Transformers and FAISS!</div>', unsafe_allow_html=True)
90
-
91
- # ---------- Load PDF ----------
92
- pdf_path = "ml_dataset_25_pages.pdf"
93
- raw_text = load_pdf_text(pdf_path)
94
- chunks = chunk_text(raw_text)
95
- embeddings, embedder = embed_chunks(chunks)
96
-
97
- # ---------- QA Pipeline ----------
98
- qa = pipeline(
99
- "question-answering",
100
- model="deepset/roberta-base-squad2",
101
- tokenizer="deepset/roberta-base-squad2"
102
- )
103
-
104
- # ---------- Sample Questions ----------
105
- st.markdown('<div class="question-box"><strong>πŸ’‘ Sample Questions:</strong>', unsafe_allow_html=True)
106
- sample_questions = [
107
- "What is supervised learning?",
108
- "Explain the difference between regression and classification.",
109
- "What are the applications of machine learning?",
110
- "How does decision tree algorithm work?",
111
- "What is overfitting in machine learning?"
112
- ]
113
- for q in sample_questions:
114
- st.markdown(f'<div class="example">{q}</div>', unsafe_allow_html=True)
115
- st.markdown('</div>', unsafe_allow_html=True)
116
-
117
- # ---------- User Query ----------
118
- query = st.text_input("πŸ”Ž Ask your question here:")
119
- if query:
120
- with st.spinner("Thinking..."):
121
- answer = answer_query(query, embeddings, chunks, embedder, qa)
122
- st.success(f"🧠 Answer: {answer}")
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import PyPDF2
3
  from sentence_transformers import SentenceTransformer
 
4
  import faiss
5
  import numpy as np
6
+ from transformers import pipeline
7
 
8
+ st.set_page_config(page_title="πŸ“˜ PDF QA RAG App", layout="wide")
9
+
10
+ # Custom styles
11
+ st.markdown("""
12
+ <style>
13
+ .main {background-color: #f7faff;}
14
+ .block-container {padding-top: 2rem;}
15
+ h1 {color: #4051b5;}
16
+ .stTextInput>div>div>input {border: 2px solid #d0d7ff;}
17
+ .stButton button {background-color: #4051b5; color: white; border-radius: 6px;}
18
+ .stSidebar {background-color: #eaf0ff;}
19
+ .sample-dropdown label {font-weight: bold;}
20
+ </style>
21
+ """, unsafe_allow_html=True)
22
+
23
+ st.title("πŸ“˜ Ask Me Anything From Your PDF")
24
+ st.caption("Built using RAG (Retrieval-Augmented Generation) ✨")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
# Sidebar: PDF source selection.
st.sidebar.header("📁 Upload PDF")
uploaded_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])

# Ready-made example questions surfaced in the "Sample Questions" selectbox.
default_questions = [
    "What is machine learning?",
    "Explain generalization in ML.",
    "What are different types of ML?",
    "How is ML used in computer vision?",
    "Describe the importance of training data.",
]
36
+
37
@st.cache_data
def load_pdf(file):
    """Extract the text of every page from an uploaded PDF.

    Args:
        file: A file-like object readable by ``PyPDF2.PdfReader``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        list[str]: One string per page. Pages with no extractable text
        (image-only or empty pages) yield ``""`` rather than ``None``,
        so downstream ``" ".join(pages)`` and ``len(page)`` never fail.
    """
    reader = PyPDF2.PdfReader(file)
    # extract_text() returns None when a page has no text layer; coerce to "".
    return [page.extract_text() or "" for page in reader.pages]
41
+
42
def chunk_text(pages, max_len=1000):
    """Split the combined text of *pages* into chunks of at most *max_len* words.

    Args:
        pages: Iterable of page-text strings.
        max_len: Maximum number of whitespace-separated words per chunk.

    Returns:
        list[str]: Word chunks, each rejoined with single spaces.
    """
    words = " ".join(pages).split()
    chunks = []
    for start in range(0, len(words), max_len):
        chunks.append(" ".join(words[start:start + max_len]))
    return chunks
46
+
47
def create_faiss_index(chunks, model):
    """Embed *chunks* with *model* and build a flat L2 FAISS index over them.

    Args:
        chunks: List of text chunks to embed.
        model: A SentenceTransformer-style object exposing ``encode``.

    Returns:
        tuple: ``(index, embeddings)`` — the populated FAISS index and the
        raw embedding matrix.
    """
    embeddings = model.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings))
    return index, embeddings
52
+
53
def retrieve_context(question, chunks, index, model, k=6):
    """Return the *k* chunks nearest to *question*, joined by blank lines.

    Args:
        question: The user's query string.
        chunks: The chunk list the index was built from.
        index: A FAISS index over the chunk embeddings.
        model: The embedder used to build the index (must match).
        k: Number of nearest chunks to retrieve.

    Returns:
        str: The retrieved chunks concatenated with ``"\\n\\n"`` separators.
    """
    query_vec = model.encode([question])
    _, neighbors = index.search(np.array(query_vec), k)
    selected = [chunks[i] for i in neighbors[0]]
    return "\n\n".join(selected)
57
+
58
if uploaded_file:
    st.success("✅ PDF uploaded successfully!")

    # Streamlit reruns this whole script on every widget interaction, so the
    # heavy models must be cached or they are reloaded on every click.
    @st.cache_resource
    def _load_embedder():
        """Load the sentence embedder once per process."""
        return SentenceTransformer('all-MiniLM-L6-v2')

    @st.cache_resource
    def _load_qa_pipeline():
        """Load the extractive QA pipeline once per process."""
        return pipeline("question-answering", model="deepset/roberta-base-squad2")

    pages = load_pdf(uploaded_file)
    chunks = chunk_text(pages)
    model = _load_embedder()
    index, _ = create_faiss_index(chunks, model)
    qa_pipeline = _load_qa_pipeline()

    st.subheader("💬 Ask a question")

    col1, col2 = st.columns([3, 1])
    with col1:
        question = st.text_input("Enter your question here...", placeholder="e.g. What is deep learning?")
    with col2:
        if st.button("Ask"):
            if not question.strip():
                # Guard: the QA pipeline raises on an empty question string.
                st.warning("Please type a question first.")
            else:
                with st.spinner("🧠 Thinking..."):
                    context = retrieve_context(question, chunks, index, model)
                    result = qa_pipeline(question=question, context=context)
                with st.expander("📖 Answer", expanded=True):
                    st.markdown(result['answer'])

    st.divider()
    st.subheader("✨ Sample Questions")
    selected_q = st.selectbox("Pick one to try:", default_questions, key="sample-dropdown")
    if st.button("Try Selected Question"):
        with st.spinner("⏳ Searching..."):
            context = retrieve_context(selected_q, chunks, index, model)
            result = qa_pipeline(question=selected_q, context=context)
        with st.expander(f"💡 Answer to: '{selected_q}'", expanded=True):
            st.markdown(result['answer'])

    st.divider()
    st.subheader("📄 Preview PDF Pages")
    for i, page in enumerate(pages[:3]):
        # extract_text() can yield None for image-only pages; guard len()/slicing.
        text = page or ""
        st.markdown(f"**Page {i+1}**")
        st.code(text[:800] + "..." if len(text) > 800 else text)

else:
    st.info("Upload a PDF from the sidebar to begin.")