# NOTE: Hugging Face Spaces viewer chrome (status lines, commit hashes, line
# numbers) removed from the top of this file — it was extraction residue, not code.
import os
import shutil
import tempfile
import fitz # PyMuPDF
import streamlit as st
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
# --- Streamlit Config ---
# st.set_page_config must run before any other st.* call in the script.
# (Page title/emoji strings contain mojibake from a past encoding mixup;
# left byte-identical here since they are runtime strings.)
st.set_page_config(page_title="π RAG PDF Chatbot", layout="wide")
st.title("π RAG-based PDF Chatbot")
# --- Logging ---
# Root logger at INFO so errors logged in extract_text_from_pdf are visible.
logging.basicConfig(level=logging.INFO)
# --- Load Model ---
@st.cache_resource
def load_model():
    """Build the LaMini-T5 text2text pipeline and wrap it for LangChain.

    Cached by Streamlit so the checkpoint is downloaded/loaded only once
    per server process.
    """
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    text_gen = pipeline(
        "text2text-generation",
        model=seq2seq,
        tokenizer=tok,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=text_gen)
# --- Extract PDF Text ---
def extract_text_from_pdf(file):
    """Extract all page text from an uploaded PDF.

    Args:
        file: a file-like object exposing .read() (e.g. a Streamlit
            UploadedFile).

    Returns:
        The concatenated text of every page, joined by newlines, or ""
        if the PDF cannot be opened or read.
    """
    try:
        # Use the Document as a context manager so the handle is closed
        # even if page extraction raises — the original leaked it.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Broad catch is deliberate: any parse failure degrades to "",
        # which the caller treats as "could not extract text".
        logging.error(f"Error reading PDF: {e}")
        return ""
# --- Create Chroma Vectorstore Safely ---
def create_vectorstore(documents, embeddings):
    """Index *documents* with *embeddings* in a Chroma store.

    The store persists into a freshly created temp directory, so
    concurrent sessions never collide on the same on-disk index.
    """
    persist_path = tempfile.mkdtemp()  # unique, writable temp dir
    return Chroma.from_documents(
        documents,
        embedding=embeddings,
        persist_directory=persist_path,
    )
# --- Build RAG QA Chain ---
def build_qa_chain(retriever, llm):
    """Wire *retriever* and *llm* into a RetrievalQA chain.

    The prompt grounds the model in the retrieved context before the
    user's question is appended.
    """
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.
Context:
{context}
Question:
{question}
Helpful Answer:
""",
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
    )
# --- Process QA ---
def process_question(question, full_text):
    """Answer *question* against *full_text* using a RAG pipeline.

    The text is written to a temp file, loaded, chunked, embedded,
    indexed in Chroma, and queried through the cached LLM.

    Args:
        question: the user's natural-language question.
        full_text: the full extracted PDF text.

    Returns:
        The chain's answer string.
    """
    # Use a unique temp file instead of a fixed "temp_text.txt" in the
    # CWD: the fixed name collided across concurrent sessions, was never
    # deleted, and was written with the platform default encoding (which
    # breaks on non-ASCII PDF text on some platforms). UTF-8 is explicit.
    fd, tmp_path = tempfile.mkstemp(suffix=".txt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(full_text)
        docs = TextLoader(tmp_path, encoding="utf-8").load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        chunks = splitter.split_documents(docs)
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        vectorstore = create_vectorstore(chunks, embeddings)
        retriever = vectorstore.as_retriever()
        llm = load_model()
        qa = build_qa_chain(retriever, llm)
        return qa.run(question)
    finally:
        # Best-effort cleanup; ignore races where the file is already gone.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
# --- Sidebar Upload ---
with st.sidebar:
    st.header("π Upload your PDF")
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
# --- Main Logic ---
# Streamlit reruns this whole script on every interaction; the flow below is
# gated on (1) a file being uploaded, (2) text extraction succeeding, and
# (3) the user having typed a question.
if uploaded_file:
    st.success(f"Uploaded: {uploaded_file.name}")
    full_text = extract_text_from_pdf(uploaded_file)
    if full_text:
        # Preview only the first 3000 chars to keep the expander light.
        with st.expander("π View Extracted PDF Text", expanded=False):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
        st.subheader("π¬ Ask Something")
        user_question = st.text_input("Ask a question about the document")
        if user_question:
            with st.spinner("Analyzing..."):
                try:
                    answer = process_question(user_question, full_text)
                except Exception as e:
                    # Catch-all boundary: surface a friendly message and
                    # halt the rerun rather than show a traceback.
                    st.error("β οΈ Something went wrong. Try re-uploading the PDF.")
                    st.stop()
                st.markdown("### π€ Answer")
                st.write(answer)
        # Sample questions shown only once a PDF has been parsed.
        with st.sidebar:
            st.markdown("---")
            st.caption("π‘ Sample Questions")
            st.markdown("""
- "Summarize the document"
- "What is the experience of Pradeep Singh Sengar?"
- "What are the key points?"
- "Explain in short"
""")
    else:
        st.error("β Could not extract text. Try a different PDF.")
else:
    st.info("Upload a PDF to get started.")