import os
import logging

import streamlit as st
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
# --- Configuration ---
st.set_page_config(page_title="📄 RAG PDF Chatbot", layout="wide")
st.title("📄 RAG-based PDF Chatbot")
device = "cpu"  # run inference on CPU

# --- Logging ---
logging.basicConfig(level=logging.INFO)
# --- Load LLM ---
@st.cache_resource
def load_model():
    # Cached so the model weights are loaded once per session rather than on
    # every Streamlit rerun (st.cache_resource assumes Streamlit >= 1.18).
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer,
                    max_length=1024, do_sample=True, temperature=0.3,
                    top_k=50, top_p=0.95, device=device)
    return HuggingFacePipeline(pipeline=pipe)
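# A minimal smoke test for the wrapped LLM (a sketch; assumes a LangChain
# version where HuggingFacePipeline supports .invoke, while older releases
# call the object directly, e.g. llm("...")):
#
#   llm = load_model()
#   print(llm.invoke("What is retrieval-augmented generation?"))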
# --- Extract PDF Text ---
def read_pdf(file):
    try:
        # Open the uploaded file from its in-memory bytes; the context
        # manager ensures the document handle is closed afterwards.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Failed to extract text: {e}")
        return ""
# --- Process Answer ---
def process_answer(question, full_text):
    # Save the full text to a temporary file so it can be loaded as a document
    with open("temp_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)

    loader = TextLoader("temp_text.txt", encoding="utf-8")
    docs = loader.load()

    # Chunk the documents with increased size and overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
    splits = text_splitter.split_documents(docs)

    # Load embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create Chroma in-memory vector store
    db = Chroma.from_documents(splits, embedding=embeddings)
    retriever = db.as_retriever()

    # Set up the model
    llm = load_model()

    # Create a custom prompt
    prompt_template = PromptTemplate.from_template("""
You are a helpful assistant. Use the following context to answer the question as accurately and thoroughly as possible.

Context: {context}

Question: {question}

Answer in detail:""")

    # Retrieval QA with the custom prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template}
    )

    # Answer the question, then clean up the temporary file
    answer = qa_chain.run(question)
    if os.path.exists("temp_text.txt"):
        os.remove("temp_text.txt")
    return answer
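# Note: process_answer() rebuilds the embeddings, the Chroma index, and the
# QA chain on every question. A minimal sketch of reusing the retriever across
# questions in one session (st.session_state is a standard Streamlit API; the
# "retriever" key name is our own choice, not from the original code):
#
#   if "retriever" not in st.session_state:
#       st.session_state["retriever"] = db.as_retriever()
#   retriever = st.session_state["retriever"]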
# --- UI Layout ---
with st.sidebar:
    st.header("📁 Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
# --- Main Interface ---
if uploaded_file:
    st.success(f"You uploaded: {uploaded_file.name}")
    full_text = read_pdf(uploaded_file)

    if full_text:
        st.subheader("📖 PDF Preview")
        with st.expander("View Extracted Text"):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("💬 Ask a Question")
        user_question = st.text_input("Type your question about the PDF content")

        if user_question:
            with st.spinner("Thinking..."):
                answer = process_answer(user_question, full_text)
            st.markdown("### 🤖 Answer")
            st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
            with st.expander("💡 Suggestions", expanded=True):
                st.markdown("""
                - "Summarize this document"
                - "Give a quick summary"
                - "What are the main points?"
                - "Explain this document in short"
                """)
    else:
        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
else:
    st.info("Upload a PDF to begin.")
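# Likely requirements.txt for this Space, inferred from the imports above
# (a sketch; the source does not pin versions):
#   streamlit, PyMuPDF, transformers, torch, sentence-transformers,
#   langchain, langchain-community, chromadb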