Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Files Community

Bot_RAG / app.py

pradeepsengarr

Update app.py

cb0ff81 verified about 1 month ago

raw

history blame

4.24 kB

	import os
	import shutil
	import tempfile
	import fitz # PyMuPDF
	import streamlit as st
	import logging

	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain_community.embeddings import SentenceTransformerEmbeddings
	from langchain.chains import RetrievalQA
	from langchain_community.llms import HuggingFacePipeline
	from langchain.prompts import PromptTemplate
	from langchain_community.document_loaders import TextLoader

	# --- Streamlit Config ---
	# Page-level settings (browser tab title, wide layout) followed by the
	# on-page heading. NOTE(review): Streamlit expects set_page_config to run
	# before other Streamlit commands, so keep it ahead of st.title.
	st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
	st.title("📚 RAG-based PDF Chatbot")

	# --- Logging ---
	# Configure the root logger at INFO so the logging.error call made when
	# PDF extraction fails is emitted.
	logging.basicConfig(level=logging.INFO)

	# --- Load Model ---
	@st.cache_resource
	def load_model():
	    """Build the LaMini-T5 generation pipeline and wrap it for LangChain.

	    The function is decorated with ``st.cache_resource`` so Streamlit can
	    reuse the returned object instead of reloading the weights.

	    Returns:
	        A ``HuggingFacePipeline`` wrapping a ``text2text-generation``
	        pipeline whose ``max_length`` is 512.
	    """
	    model_name = "MBZUAI/LaMini-T5-738M"
	    generator = pipeline(
	        "text2text-generation",
	        model=AutoModelForSeq2SeqLM.from_pretrained(model_name),
	        tokenizer=AutoTokenizer.from_pretrained(model_name),
	        max_length=512,
	    )
	    return HuggingFacePipeline(pipeline=generator)

	# --- Extract PDF Text ---
	def extract_text_from_pdf(file):
	    """Return the concatenated text of every page in an uploaded PDF.

	    Args:
	        file: A binary file-like object whose ``read()`` returns the raw
	            PDF bytes (for example the object returned by
	            ``st.file_uploader``).

	    Returns:
	        The page texts joined with newlines, or ``""`` when the bytes
	        cannot be opened or parsed as a PDF. Failures are logged with a
	        traceback rather than raised, so callers only need to check for
	        an empty string.
	    """
	    try:
	        # Open as a context manager so the document is closed even if
	        # text extraction fails part-way through the pages.
	        with fitz.open(stream=file.read(), filetype="pdf") as doc:
	            return "\n".join(page.get_text() for page in doc)
	    except Exception as e:
	        # Best-effort boundary: keep the original "return empty string"
	        # contract, but record the traceback for diagnosis.
	        logging.exception("Error reading PDF: %s", e)
	        return ""

	# --- Create Chroma Vectorstore Safely ---
	def create_vectorstore(documents, embeddings):
	    """Index ``documents`` in a Chroma store persisted to a fresh temp dir.

	    Args:
	        documents: Documents to embed and index.
	        embeddings: Embedding function handed to Chroma.

	    Returns:
	        The ``Chroma`` vector store built from ``documents``.
	    """
	    # A new directory from tempfile.mkdtemp() is used for every call.
	    # NOTE(review): nothing in this function removes that directory
	    # afterwards, so directories accumulate across calls.
	    return Chroma.from_documents(
	        documents,
	        embedding=embeddings,
	        persist_directory=tempfile.mkdtemp(),
	    )

	# --- Build RAG QA Chain ---
	def build_qa_chain(retriever, llm):
	    """Assemble a ``RetrievalQA`` chain that answers using a fixed prompt.

	    Args:
	        retriever: Retriever passed to the chain.
	        llm: Language model passed to the chain.

	    Returns:
	        The chain created by ``RetrievalQA.from_chain_type`` with the
	        custom prompt supplied through ``chain_type_kwargs``.
	    """
	    # The prompt exposes exactly two placeholders: {context} and {question}.
	    qa_template = """
	You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.

	Context:
	{context}

	Question:
	{question}

	Helpful Answer:
	"""
	    qa_prompt = PromptTemplate(
	        template=qa_template,
	        input_variables=["context", "question"],
	    )
	    chain_kwargs = {"prompt": qa_prompt}
	    return RetrievalQA.from_chain_type(
	        llm=llm,
	        retriever=retriever,
	        chain_type_kwargs=chain_kwargs,
	    )

	# --- Process QA ---
	def process_question(question, full_text):
	    """Answer ``question`` from ``full_text`` with a retrieval-augmented chain.

	    The text is split into overlapping chunks, embedded into a Chroma
	    store, and queried through the RetrievalQA chain.

	    Args:
	        question: The user's question.
	        full_text: The full extracted document text.

	    Returns:
	        The answer produced by the QA chain's ``run`` method.

	    Raises:
	        ValueError: If ``full_text`` is empty or whitespace only, since
	            there would be nothing to index.
	    """
	    if not full_text or not full_text.strip():
	        raise ValueError("No document text available to answer questions from.")

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
	    # Split in memory rather than writing a fixed "temp_text.txt" and
	    # reading it back: a fixed path is shared by concurrent sessions,
	    # depends on the platform default encoding, and is never cleaned up.
	    chunks = text_splitter.create_documents(
	        [full_text],
	        metadatas=[{"source": "uploaded_pdf"}],
	    )

	    vectorstore = create_vectorstore(chunks, _load_embeddings())
	    retriever = vectorstore.as_retriever()

	    qa = build_qa_chain(retriever, load_model())
	    return qa.run(question)


	@st.cache_resource
	def _load_embeddings():
	    """Return the sentence-transformer embedding model, loaded once and reused."""
	    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

	# --- Sidebar Upload ---
	# Sidebar: the PDF uploader. uploaded_file is falsy until a file is chosen.
	with st.sidebar:
	    st.header("📄 Upload your PDF")
	    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

	# --- Main Logic ---
	# Flow: upload -> extract text -> preview -> question -> answer.
	if uploaded_file:
	    st.success(f"Uploaded: {uploaded_file.name}")
	    full_text = extract_text_from_pdf(uploaded_file)

	    if full_text:
	        # Preview only the first 3000 characters, with an ellipsis when
	        # the text was truncated.
	        with st.expander("📄 View Extracted PDF Text", expanded=False):
	            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

	        st.subheader("💬 Ask Something")
	        user_question = st.text_input("Ask a question about the document")

	        if user_question:
	            with st.spinner("Analyzing..."):
	                try:
	                    answer = process_question(user_question, full_text)
	                except Exception:
	                    # Log the traceback so the failure can be diagnosed;
	                    # the user still sees the same generic message.
	                    logging.exception("Failed to answer question")
	                    st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
	                    # Halt this script run so the answer section below is
	                    # never reached without an answer.
	                    st.stop()
	            st.markdown("### 🤖 Answer")
	            st.write(answer)

	        with st.sidebar:
	            st.markdown("---")
	            st.caption("💡 Sample Questions")
	            st.markdown("""
	- "Summarize the document"
	- "What is the experience of Pradeep Singh Sengar?"
	- "What are the key points?"
	- "Explain in short"
	""")
	    else:
	        st.error("❌ Could not extract text. Try a different PDF.")
	else:
	    st.info("Upload a PDF to get started.")