pradeepsengarr committed
Commit b58d6fd · verified · 1 Parent(s): 7afdcd2

Update app.py

Files changed (1):
  1. app.py +201 -201
app.py CHANGED
@@ -1,225 +1,225 @@
- # import os
- # import streamlit as st
- # import fitz # PyMuPDF
- # import logging
- # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_community.vectorstores import Chroma
- # from langchain_community.embeddings import SentenceTransformerEmbeddings
- # from langchain_community.llms import HuggingFacePipeline
- # from langchain.chains import RetrievalQA
- # from langchain.prompts import PromptTemplate
- # from langchain_community.document_loaders import TextLoader
-
- # # --- Configuration ---
- # st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
- # st.title("📚 RAG-based PDF Chatbot")
- # device = "cpu"
-
- # # --- Logging ---
- # logging.basicConfig(level=logging.INFO)
-
- # # --- Load LLM ---
- # @st.cache_resource
- # def load_model():
- #     checkpoint = "MBZUAI/LaMini-T5-738M"
- #     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- #     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
- #     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
- #     return HuggingFacePipeline(pipeline=pipe)
-
- # # --- Extract PDF Text ---
- # def read_pdf(file):
- #     try:
- #         doc = fitz.open(stream=file.read(), filetype="pdf")
- #         text = ""
- #         for page in doc:
- #             text += page.get_text()
- #         return text.strip()
- #     except Exception as e:
- #         logging.error(f"Failed to extract text: {e}")
- #         return ""
-
- # # --- Process Answer ---dd
- # def process_answer(question, full_text):
- #     # Save the full_text to a temporary file
- #     with open("temp_text.txt", "w") as f:
- #         f.write(full_text)
-
- #     loader = TextLoader("temp_text.txt")
- #     docs = loader.load()
-
- #     # Chunk the documents with increased size and overlap
- #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
- #     splits = text_splitter.split_documents(docs)
-
- #     # Load embeddings
- #     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
- #     # Create Chroma in-memory vector store
- #     db = Chroma.from_documents(splits, embedding=embeddings)
- #     retriever = db.as_retriever()
-
- #     # Set up the model
- #     llm = load_model()
-
- #     # Create a custom prompt
- #     prompt_template = PromptTemplate(
- #         input_variables=["context", "question"],
- #         template="""
- # You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

- # Context:
- # {context}

- # Question:
- # {question}

- # Important Instructions:
- # - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
- # - Do NOT summarize or paraphrase.
- # - If the information is not in the context, say "Not found in the document."

- # Answer:
- # """)


- #     # Retrieval QA with custom prompt
- #     qa_chain = RetrievalQA.from_chain_type(
- #         llm=llm,
- #         retriever=retriever,
- #         chain_type="stuff",
- #         chain_type_kwargs={"prompt": prompt_template}
- #     )

- #     # Return the answer using the retrieval QA chain
- #     return qa_chain.run(question)

- # # --- UI Layout ---
- # with st.sidebar:
- #     st.header("📄 Upload PDF")
- #     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

- # # --- Main Interface ---
- # if uploaded_file:
- #     st.success(f"You uploaded: {uploaded_file.name}")
- #     full_text = read_pdf(uploaded_file)

- #     if full_text:
- #         st.subheader("📝 PDF Preview")
- #         with st.expander("View Extracted Text"):
- #             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

- #         st.subheader("💬 Ask a Question")
- #         user_question = st.text_input("Type your question about the PDF content")

- #         if user_question:
- #             with st.spinner("Thinking..."):
- #                 answer = process_answer(user_question, full_text)
- #                 st.markdown("### 🤖 Answer")
- #                 st.write(answer)
-
- #         with st.sidebar:
- #             st.markdown("---")
- #             st.markdown("**💡 Suggestions:**")
- #             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
- #             with st.expander("💡 Suggestions", expanded=True):
- #                 st.markdown("""
- #                 - "Summarize this document"
- #                 - "Give a quick summary"
- #                 - "What are the main points?"
- #                 - "Explain this document in short"
- #                 """)
-
- #     else:
- #         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
- # else:
- #     st.info("Upload a PDF to begin.")


- import os
- import streamlit as st
- from langchain_community.document_loaders import PyPDFLoader
- from langchain_text_splitters import RecursiveCharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain_community.vectorstores import FAISS
- from langchain.chains import RetrievalQA
- from langchain.prompts import PromptTemplate
- from langchain.llms import HuggingFaceHub
-
- # Set your Hugging Face API token here
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
-
- # Load and split PDF
- def load_and_split_pdf(uploaded_file):
-     with open("temp.pdf", "wb") as f:
-         f.write(uploaded_file.read())
-     loader = PyPDFLoader("temp.pdf")
-     documents = loader.load()
-
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-     chunks = text_splitter.split_documents(documents)
-     return chunks
-
- # Build vectorstore
- def build_vectorstore(chunks):
-     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-     vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
-     return vectorstore
-
- # Load Lamini or other HF model
- def get_llm():
-     return HuggingFaceHub(
-         repo_id="lamini/lamini-13b-chat",
-         model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
-     )

- # Create prompt template (optional for better accuracy)
- custom_prompt = PromptTemplate(
-     input_variables=["context", "question"],
-     template="""
- You are a helpful assistant. Use the following context to answer the question as accurately as possible.
- If the answer is not in the context, respond with "Not found in the document."

- Context:
- {context}

- Question: {question}

- Answer:"""
- )

- # Build QA chain
- def build_qa_chain(vectorstore):
-     llm = get_llm()
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
-         chain_type_kwargs={"prompt": custom_prompt}
-     )
-     return qa_chain

- # Streamlit UI
- def main():
-     st.set_page_config(page_title="PDF Chatbot", layout="wide")
-     st.title("Chat with your PDF")

-     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

-     if uploaded_file:
-         st.success("PDF uploaded successfully!")
-         with st.spinner("Processing PDF..."):
-             chunks = load_and_split_pdf(uploaded_file)
-             vectorstore = build_vectorstore(chunks)
-             qa_chain = build_qa_chain(vectorstore)
-         st.success("Ready to chat!")

-         user_question = st.text_input("Ask a question based on the PDF:")
-         if user_question:
-             with st.spinner("Generating answer..."):
-                 result = qa_chain.run(user_question)
-             st.markdown("**Answer:**")
-             st.write(result)

- if __name__ == "__main__":
-     main()
+ import os
+ import streamlit as st
+ import fitz # PyMuPDF
+ import logging
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
+ from langchain_community.llms import HuggingFacePipeline
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ from langchain_community.document_loaders import TextLoader
+
+ # --- Configuration ---
+ st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+ st.title("📚 RAG-based PDF Chatbot")
+ device = "cpu"
+
+ # --- Logging ---
+ logging.basicConfig(level=logging.INFO)
+
+ # --- Load LLM ---
+ @st.cache_resource
+ def load_model():
+     checkpoint = "MBZUAI/LaMini-T5-738M"
+     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
+     return HuggingFacePipeline(pipeline=pipe)
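+ # HuggingFacePipeline wraps the local transformers pipeline so LangChain
+ # chains can call it like any other LLM. Note that do_sample=True with
+ # temperature=0.3 keeps answers mildly non-deterministic; do_sample=False
+ # would give repeatable output.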
+
+ # --- Extract PDF Text ---
+ def read_pdf(file):
+     try:
+         doc = fitz.open(stream=file.read(), filetype="pdf")
+         text = ""
+         for page in doc:
+             text += page.get_text()
+         return text.strip()
+     except Exception as e:
+         logging.error(f"Failed to extract text: {e}")
+         return ""
+
+ # --- Process Answer ---
+ def process_answer(question, full_text):
+     # Save the full_text to a temporary file
+     with open("temp_text.txt", "w") as f:
+         f.write(full_text)
+
+     loader = TextLoader("temp_text.txt")
+     docs = loader.load()
+
+     # Chunk the documents with increased size and overlap
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
+     splits = text_splitter.split_documents(docs)
+
+     # Load embeddings
+     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+
+     # Create Chroma in-memory vector store
+     db = Chroma.from_documents(splits, embedding=embeddings)
+     retriever = db.as_retriever()
+
+     # Set up the model
+     llm = load_model()
+
+     # Create a custom prompt
+     prompt_template = PromptTemplate(
+         input_variables=["context", "question"],
+         template="""
+ You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

+ Context:
+ {context}

+ Question:
+ {question}

+ Important Instructions:
+ - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
+ - Do NOT summarize or paraphrase.
+ - If the information is not in the context, say "Not found in the document."

+ Answer:
+ """)


+     # Retrieval QA with custom prompt
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         retriever=retriever,
+         chain_type="stuff",
+         chain_type_kwargs={"prompt": prompt_template}
+     )

+     # Return the answer using the retrieval QA chain
+     return qa_chain.run(question)
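+ # Note: the Chroma index is rebuilt from scratch on every question, so
+ # responses slow down with PDF size; caching the index per uploaded
+ # document (e.g. with st.cache_resource) would avoid re-embedding.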

+ # --- UI Layout ---
+ with st.sidebar:
+     st.header("📄 Upload PDF")
+     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

+ # --- Main Interface ---
+ if uploaded_file:
+     st.success(f"You uploaded: {uploaded_file.name}")
+     full_text = read_pdf(uploaded_file)

+     if full_text:
+         st.subheader("📝 PDF Preview")
+         with st.expander("View Extracted Text"):
+             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

+         st.subheader("💬 Ask a Question")
+         user_question = st.text_input("Type your question about the PDF content")

+         if user_question:
+             with st.spinner("Thinking..."):
+                 answer = process_answer(user_question, full_text)
+                 st.markdown("### 🤖 Answer")
+                 st.write(answer)
+
+         with st.sidebar:
+             st.markdown("---")
+             st.markdown("**💡 Suggestions:**")
+             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+             with st.expander("💡 Suggestions", expanded=True):
+                 st.markdown("""
+                 - "Summarize this document"
+                 - "Give a quick summary"
+                 - "What are the main points?"
+                 - "Explain this document in short"
+                 """)
+
+     else:
+         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
+ else:
+     st.info("Upload a PDF to begin.")


+ # import os
+ # import streamlit as st
+ # from langchain_community.document_loaders import PyPDFLoader
+ # from langchain_text_splitters import RecursiveCharacterTextSplitter
+ # from langchain_community.embeddings import HuggingFaceEmbeddings
+ # from langchain_community.vectorstores import FAISS
+ # from langchain.chains import RetrievalQA
+ # from langchain.prompts import PromptTemplate
+ # from langchain.llms import HuggingFaceHub
+
+ # # Set your Hugging Face API token here
+ # os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
+
+ # # Load and split PDF
+ # def load_and_split_pdf(uploaded_file):
+ #     with open("temp.pdf", "wb") as f:
+ #         f.write(uploaded_file.read())
+ #     loader = PyPDFLoader("temp.pdf")
+ #     documents = loader.load()
+
+ #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+ #     chunks = text_splitter.split_documents(documents)
+ #     return chunks
+
+ # # Build vectorstore
+ # def build_vectorstore(chunks):
+ #     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+ #     vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
+ #     return vectorstore
+
+ # # Load Lamini or other HF model
+ # def get_llm():
+ #     return HuggingFaceHub(
+ #         repo_id="lamini/lamini-13b-chat",
+ #         model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
+ #     )

+ # # Create prompt template (optional for better accuracy)
+ # custom_prompt = PromptTemplate(
+ #     input_variables=["context", "question"],
+ #     template="""
+ # You are a helpful assistant. Use the following context to answer the question as accurately as possible.
+ # If the answer is not in the context, respond with "Not found in the document."

+ # Context:
+ # {context}

+ # Question: {question}

+ # Answer:"""
+ # )

+ # # Build QA chain
+ # def build_qa_chain(vectorstore):
+ #     llm = get_llm()
+ #     qa_chain = RetrievalQA.from_chain_type(
+ #         llm=llm,
+ #         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
+ #         chain_type_kwargs={"prompt": custom_prompt}
+ #     )
+ #     return qa_chain

+ # # Streamlit UI
+ # def main():
+ #     st.set_page_config(page_title="PDF Chatbot", layout="wide")
+ #     st.title("Chat with your PDF")

+ #     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

+ #     if uploaded_file:
+ #         st.success("PDF uploaded successfully!")
+ #         with st.spinner("Processing PDF..."):
+ #             chunks = load_and_split_pdf(uploaded_file)
+ #             vectorstore = build_vectorstore(chunks)
+ #             qa_chain = build_qa_chain(vectorstore)
+ #         st.success("Ready to chat!")

+ #         user_question = st.text_input("Ask a question based on the PDF:")
+ #         if user_question:
+ #             with st.spinner("Generating answer..."):
+ #                 result = qa_chain.run(user_question)
+ #             st.markdown("**Answer:**")
+ #             st.write(result)

+ # if __name__ == "__main__":
+ #     main()
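
For a quick sanity check of the newly enabled retrieval path outside Streamlit, the sketch below wires up the same pieces by hand. It is a minimal sketch, not part of the commit: it assumes the MBZUAI/LaMini-T5-738M and BAAI/bge-base-en-v1.5 checkpoints can be downloaded, and "sample.pdf" is a hypothetical local PDF with a text layer.

# Standalone smoke test mirroring the active code path in app.py.
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline

# Extract the text layer, as read_pdf() does.
doc = fitz.open("sample.pdf")
text = "".join(page.get_text() for page in doc).strip()

# Chunk with the same parameters as process_answer().
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
chunks = splitter.split_text(text)

# Embed and index in an in-memory Chroma store.
embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
db = Chroma.from_texts(chunks, embedding=embeddings)

# Load the same seq2seq model that load_model() caches.
checkpoint = "MBZUAI/LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
llm = HuggingFacePipeline(pipeline=pipeline(
    "text2text-generation", model=model, tokenizer=tokenizer, max_length=1024))

# Retrieve the top chunks and answer one question.
question = "What is this document about?"
context = "\n\n".join(d.page_content for d in db.similarity_search(question, k=4))
print(llm.invoke(f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"))

If this prints a sensible answer while the Space does not, the problem is in the Streamlit wiring rather than in the retrieval stack.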