Spaces:
Sleeping
Sleeping
import atexit
import logging
import os
import shutil
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# --- Streamlit Config ---
# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    page_title="π RAG PDF Chatbot",
    layout="wide",
)
st.title("π RAG-based PDF Chatbot")

# --- Logging ---
# Root logger emits INFO and above.
logging.basicConfig(level=logging.INFO)
# --- Load Model ---
@st.cache_resource(show_spinner=False)
def load_model():
    """Build the LaMini-T5 generation pipeline wrapped for LangChain.

    The result is cached with ``st.cache_resource`` so the checkpoint is
    loaded once per server process instead of on every question; without
    the cache each Streamlit rerun that reaches this call would reload the
    tokenizer and model weights from scratch.

    Returns:
        A ``HuggingFacePipeline`` around a ``text2text-generation``
        pipeline for ``MBZUAI/LaMini-T5-738M`` with ``max_length=512``.
    """
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=pipe)
# --- Extract PDF Text ---
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file: Binary file-like object whose ``read()`` yields the PDF bytes.

    Returns:
        The concatenated page text, or ``""`` if the PDF could not be
        opened or parsed (the failure is logged, not raised).
    """
    try:
        # The context manager closes the document even when a page fails
        # to parse; previously the handle was never released.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort boundary: callers treat "" as "extraction failed".
        # logging.exception keeps the traceback for diagnosis.
        logging.exception("Error reading PDF: %s", e)
        return ""
# --- Create Chroma Vectorstore Safely ---
def create_vectorstore(documents, embeddings):
    """Index ``documents`` in a Chroma store backed by a fresh temp directory.

    Args:
        documents: Documents to embed and index.
        embeddings: Embedding function handed to Chroma.

    Returns:
        The Chroma vector store built from ``documents``.

    Raises:
        Whatever ``Chroma.from_documents`` raises; the temp directory is
        removed before the exception propagates.
    """
    temp_dir = tempfile.mkdtemp()  # unique, writable temp dir
    try:
        db = Chroma.from_documents(
            documents, embedding=embeddings, persist_directory=temp_dir
        )
    except Exception:
        # Nothing will ever use this directory; do not leave it behind.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    # The store needs its directory for as long as it is in use, so cleanup
    # is deferred to interpreter exit instead of leaking the directory.
    atexit.register(shutil.rmtree, temp_dir, ignore_errors=True)
    return db
# --- Build RAG QA Chain ---
def build_qa_chain(retriever, llm):
    """Assemble a RetrievalQA chain that answers using retrieved context.

    Args:
        retriever: Retriever supplying context documents for a question.
        llm: Language model used to generate the answer.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` configured
        with the custom context/question prompt below.
    """
    qa_template = """
You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.
Context:
{context}
Question:
{question}
Helpful Answer:
"""
    qa_prompt = PromptTemplate(
        template=qa_template,
        input_variables=["context", "question"],
    )
    chain_kwargs = {"prompt": qa_prompt}
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs=chain_kwargs,
    )
# --- Process QA ---
def process_question(question, full_text):
    """Answer ``question`` from ``full_text`` with a retrieval-augmented chain.

    The text is split into overlapping chunks, embedded into a Chroma store,
    and queried through a RetrievalQA chain.

    Args:
        question: The user's question.
        full_text: Text extracted from the uploaded PDF.

    Returns:
        The answer produced by the QA chain's ``run`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    # Chunk the text in memory. The previous round trip through a fixed
    # "temp_text.txt" in the working directory was shared by concurrent
    # sessions, never deleted, and used the locale's default encoding.
    # The "source" metadata is kept equal to what the file loader produced
    # so the stored chunk metadata is unchanged.
    chunks = text_splitter.create_documents(
        [full_text], metadatas=[{"source": "temp_text.txt"}]
    )
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = create_vectorstore(chunks, embeddings)
    retriever = vectorstore.as_retriever()
    llm = load_model()
    qa = build_qa_chain(retriever, llm)
    return qa.run(question)
# --- Sidebar Upload ---
with st.sidebar:
    st.header("π Upload your PDF")
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

# --- Main Logic ---
# NOTE(review): nesting below is reconstructed from the statement order; the
# original indentation was not available — confirm against the deployed app.
if uploaded_file:
    st.success(f"Uploaded: {uploaded_file.name}")
    full_text = extract_text_from_pdf(uploaded_file)
    if full_text:
        # Preview only the first 3000 characters of the extracted text.
        with st.expander("π View Extracted PDF Text", expanded=False):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
        st.subheader("π¬ Ask Something")
        user_question = st.text_input("Ask a question about the document")
        if user_question:
            with st.spinner("Analyzing..."):
                try:
                    answer = process_question(user_question, full_text)
                except Exception:
                    # Record the real failure; previously it was discarded
                    # and only the generic message below was shown.
                    logging.exception("Failed to answer question")
                    st.error("β οΈ Something went wrong. Try re-uploading the PDF.")
                    st.stop()
            st.markdown("### π€ Answer")
            st.write(answer)
        with st.sidebar:
            st.markdown("---")
            st.caption("π‘ Sample Questions")
            st.markdown("""
- "Summarize the document"
- "What is the experience of Pradeep Singh Sengar?"
- "What are the key points?"
- "Explain in short"
""")
    else:
        st.error("β Could not extract text. Try a different PDF.")
else:
    st.info("Upload a PDF to get started.")