Update app.py
app.py
CHANGED
@@ -137,88 +137,89 @@
 # st.info("Upload a PDF to begin.")
 
 
-
+import os
 import streamlit as st
 from langchain_community.document_loaders import PyPDFLoader
-from
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain.llms import HuggingFaceHub
-import os
 
-# Set Hugging Face API
-os.environ["HUGGINGFACEHUB_API_TOKEN"] = "
+# Set your Hugging Face API token here
+os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
+
+# Load and split PDF
+def load_and_split_pdf(uploaded_file):
+    with open("temp.pdf", "wb") as f:
+        f.write(uploaded_file.read())
+    loader = PyPDFLoader("temp.pdf")
+    documents = loader.load()
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+    chunks = text_splitter.split_documents(documents)
+    return chunks
 
-#
+# Build vectorstore
+def build_vectorstore(chunks):
+    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
+    return vectorstore
+
+# Load Lamini or other HF model
+def get_llm():
+    return HuggingFaceHub(
+        repo_id="lamini/lamini-13b-chat",
+        model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
+    )
+
+# Create prompt template (optional for better accuracy)
 custom_prompt = PromptTemplate(
     input_variables=["context", "question"],
     template="""
-You are a helpful assistant. Use the context
-If the answer is not in the context,
+You are a helpful assistant. Use the following context to answer the question as accurately as possible.
+If the answer is not in the context, respond with "Not found in the document."
 
 Context:
 {context}
 
-Question:
-{question}
+Question: {question}
 
-Answer:
-"""
+Answer:"""
 )
 
-# Load PDF and split into chunks
-
-from langchain_community.document_loaders import PyPDFLoader
-import tempfile
-
-def load_and_split_pdf(uploaded_file):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
-        tmp_file.write(uploaded_file.read())
-        tmp_file_path = tmp_file.name
-
-    loader = PyPDFLoader(tmp_file_path)
-    documents = loader.load()
-
-    # Then your text splitting logic follows
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    chunks = text_splitter.split_documents(documents)
-    return chunks
-
-# Build vectorstore from document chunks
-def build_vectorstore(chunks):
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    db = FAISS.from_documents(chunks, embedding=embeddings)
-    return db
-
 # Build QA chain
 def build_qa_chain(vectorstore):
-    llm =
+    llm = get_llm()
     qa_chain = RetrievalQA.from_chain_type(
         llm=llm,
-        retriever=vectorstore.as_retriever(),
-        chain_type="stuff",
+        retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
         chain_type_kwargs={"prompt": custom_prompt}
     )
     return qa_chain
 
-# Streamlit
-
-st.
+# Streamlit UI
+def main():
+    st.set_page_config(page_title="PDF Chatbot", layout="wide")
+    st.title("Chat with your PDF")
+
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+
+    if uploaded_file:
+        st.success("PDF uploaded successfully!")
+        with st.spinner("Processing PDF..."):
+            chunks = load_and_split_pdf(uploaded_file)
+            vectorstore = build_vectorstore(chunks)
+            qa_chain = build_qa_chain(vectorstore)
+        st.success("Ready to chat!")
+
+        user_question = st.text_input("Ask a question based on the PDF:")
+        if user_question:
+            with st.spinner("Generating answer..."):
+                result = qa_chain.run(user_question)
+            st.markdown("**Answer:**")
+            st.write(result)
+
+if __name__ == "__main__":
+    main()