Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Files Community

Bot_RAG / app.py

pradeepsengarr

Update app.py

7afdcd2 verified about 2 months ago

raw

history blame

7.59 kB

	# import os
	# import streamlit as st
	# import fitz # PyMuPDF
	# import logging
	# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
	# from langchain.text_splitter import RecursiveCharacterTextSplitter
	# from langchain_community.vectorstores import Chroma
	# from langchain_community.embeddings import SentenceTransformerEmbeddings
	# from langchain_community.llms import HuggingFacePipeline
	# from langchain.chains import RetrievalQA
	# from langchain.prompts import PromptTemplate
	# from langchain_community.document_loaders import TextLoader

	# # --- Configuration ---
	# st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
	# st.title("📚 RAG-based PDF Chatbot")
	# device = "cpu"

	# # --- Logging ---
	# logging.basicConfig(level=logging.INFO)

	# # --- Load LLM ---
	# @st.cache_resource
	# def load_model():
	# checkpoint = "MBZUAI/LaMini-T5-738M"
	# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
	# pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
	# return HuggingFacePipeline(pipeline=pipe)

	# # --- Extract PDF Text ---
	# def read_pdf(file):
	# try:
	# doc = fitz.open(stream=file.read(), filetype="pdf")
	# text = ""
	# for page in doc:
	# text += page.get_text()
	# return text.strip()
	# except Exception as e:
	# logging.error(f"Failed to extract text: {e}")
	# return ""

	# # --- Process Answer ---
	# def process_answer(question, full_text):
	# # Save the full_text to a temporary file
	# with open("temp_text.txt", "w") as f:
	# f.write(full_text)

	# loader = TextLoader("temp_text.txt")
	# docs = loader.load()

	# # Chunk the documents with increased size and overlap
	# text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
	# splits = text_splitter.split_documents(docs)

	# # Load embeddings
	# embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

	# # Create Chroma in-memory vector store
	# db = Chroma.from_documents(splits, embedding=embeddings)
	# retriever = db.as_retriever()

	# # Set up the model
	# llm = load_model()

	# # Create a custom prompt
	# prompt_template = PromptTemplate(
	# input_variables=["context", "question"],
	# template="""
	# You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

	# Context:
	# {context}

	# Question:
	# {question}

	# Important Instructions:
	# - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
	# - Do NOT summarize or paraphrase.
	# - If the information is not in the context, say "Not found in the document."

	# Answer:
	# """)


	# # Retrieval QA with custom prompt
	# qa_chain = RetrievalQA.from_chain_type(
	# llm=llm,
	# retriever=retriever,
	# chain_type="stuff",
	# chain_type_kwargs={"prompt": prompt_template}
	# )

	# # Return the answer using the retrieval QA chain
	# return qa_chain.run(question)

	# # --- UI Layout ---
	# with st.sidebar:
	# st.header("📄 Upload PDF")
	# uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

	# # --- Main Interface ---
	# if uploaded_file:
	# st.success(f"You uploaded: {uploaded_file.name}")
	# full_text = read_pdf(uploaded_file)

	# if full_text:
	# st.subheader("📁 PDF Preview")
	# with st.expander("View Extracted Text"):
	# st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

	# st.subheader("💬 Ask a Question")
	# user_question = st.text_input("Type your question about the PDF content")

	# if user_question:
	# with st.spinner("Thinking..."):
	# answer = process_answer(user_question, full_text)
	# st.markdown("### 🤖 Answer")
	# st.write(answer)

	# with st.sidebar:
	# st.markdown("---")
	# st.markdown("💡 Suggestions:")
	# st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
	# with st.expander("💡 Suggestions", expanded=True):
	# st.markdown("""
	# - "Summarize this document"
	# - "Give a quick summary"
	# - "What are the main points?"
	# - "Explain this document in short"
	# """)

	# else:
	# st.error("⚠️ No text could be extracted from the PDF. Try another file.")
	# else:
	# st.info("Upload a PDF to begin.")


	import os
	import streamlit as st
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain.chains import RetrievalQA
	from langchain.prompts import PromptTemplate
	from langchain.llms import HuggingFaceHub

	# Provide a placeholder Hugging Face API token only when none is configured.
	# setdefault() leaves an existing HUGGINGFACEHUB_API_TOKEN (for example one
	# injected as a deployment secret) untouched instead of overwriting it with
	# the placeholder. Replace the placeholder, or export the variable, before
	# running the app.
	os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "your_hf_token_here")

	# Load and split PDF
	def load_and_split_pdf(uploaded_file):
	    """Write an uploaded PDF to disk, load it, and split it into chunks.

	    ``PyPDFLoader`` is handed a filesystem path, so the upload is first
	    written to a uniquely named temporary file. A unique name keeps
	    concurrent sessions from overwriting each other's file, and the file is
	    removed as soon as loading finishes.

	    Args:
	        uploaded_file: Binary file-like object containing the PDF. When it
	            exposes ``getvalue()`` that is used, so the whole buffer is
	            obtained even if the read position was advanced earlier;
	            otherwise ``read()`` is used.

	    Returns:
	        The chunks produced by ``RecursiveCharacterTextSplitter`` (chunk
	        size 500, overlap 100) from the documents ``PyPDFLoader`` loaded.
	    """
	    import tempfile  # local import keeps this block self-contained

	    getvalue = getattr(uploaded_file, "getvalue", None)
	    pdf_bytes = getvalue() if callable(getvalue) else uploaded_file.read()

	    # delete=False so the file can be reopened by path after it is closed;
	    # cleanup is done explicitly in the finally clause below.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	        tmp.write(pdf_bytes)
	        tmp_path = tmp.name

	    try:
	        documents = PyPDFLoader(tmp_path).load()
	    finally:
	        os.remove(tmp_path)

	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
	    return text_splitter.split_documents(documents)

	# Build vectorstore
	def build_vectorstore(chunks):
	    """Embed ``chunks`` and index them in a FAISS vector store.

	    Args:
	        chunks: Documents to index, passed straight to
	            ``FAISS.from_documents``.

	    Returns:
	        The FAISS store built with ``HuggingFaceEmbeddings`` using the
	        ``sentence-transformers/all-MiniLM-L6-v2`` model.
	    """
	    return FAISS.from_documents(
	        chunks,
	        embedding=HuggingFaceEmbeddings(
	            model_name="sentence-transformers/all-MiniLM-L6-v2"
	        ),
	    )

	# Load Lamini or other HF model
	def get_llm():
	    """Return a ``HuggingFaceHub`` LLM for ``lamini/lamini-13b-chat``.

	    The model is configured with temperature 0.2 and at most 512 new tokens.
	    """
	    generation_settings = {"temperature": 0.2, "max_new_tokens": 512}
	    hub_llm = HuggingFaceHub(
	        repo_id="lamini/lamini-13b-chat",
	        model_kwargs=generation_settings,
	    )
	    return hub_llm

	# Prompt template handed to the QA chain (see build_qa_chain, which passes
	# it via chain_type_kwargs). It declares two placeholders, {context} and
	# {question}, and instructs the model to answer from the context only,
	# replying "Not found in the document." when the context lacks the answer.
	custom_prompt = PromptTemplate(
	input_variables=["context", "question"],
	template="""
	You are a helpful assistant. Use the following context to answer the question as accurately as possible.
	If the answer is not in the context, respond with "Not found in the document."

	Context:
	{context}

	Question: {question}

	Answer:"""
	)

	# Build QA chain
	def build_qa_chain(vectorstore):
	    """Assemble a ``RetrievalQA`` chain over ``vectorstore``.

	    Args:
	        vectorstore: Store whose ``as_retriever`` supplies a similarity
	            retriever returning the 5 closest entries (``k=5``).

	    Returns:
	        The chain created by ``RetrievalQA.from_chain_type`` using the LLM
	        from ``get_llm()`` and ``custom_prompt`` as its prompt.
	    """
	    language_model = get_llm()
	    top_k_retriever = vectorstore.as_retriever(
	        search_type="similarity",
	        search_kwargs={"k": 5},
	    )
	    return RetrievalQA.from_chain_type(
	        llm=language_model,
	        retriever=top_k_retriever,
	        chain_type_kwargs={"prompt": custom_prompt},
	    )

	# Streamlit UI
	def main():
	    """Render the Streamlit UI: upload a PDF, then ask questions about it.

	    Streamlit re-executes the script on every widget interaction, so the QA
	    chain is kept in ``st.session_state`` and rebuilt only when a different
	    file (by name and size) is uploaded. Without this, every question typed
	    would re-parse and re-embed the whole PDF.
	    """
	    st.set_page_config(page_title="PDF Chatbot", layout="wide")
	    st.title("Chat with your PDF")

	    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
	    if not uploaded_file:
	        return

	    st.success("PDF uploaded successfully!")

	    # Identify the current upload so a cached chain is reused across reruns.
	    file_key = (uploaded_file.name, uploaded_file.size)
	    if st.session_state.get("_pdf_qa_chain_key") != file_key:
	        with st.spinner("Processing PDF..."):
	            chunks = load_and_split_pdf(uploaded_file)
	            if not chunks:
	                # No text chunks means there is nothing to index; stop
	                # here instead of failing while building the vector store.
	                st.error("⚠️ No text could be extracted from the PDF. Try another file.")
	                return
	            vectorstore = build_vectorstore(chunks)
	            st.session_state["_pdf_qa_chain"] = build_qa_chain(vectorstore)
	            # Record the key last so a failed build is retried next rerun.
	            st.session_state["_pdf_qa_chain_key"] = file_key
	    qa_chain = st.session_state["_pdf_qa_chain"]
	    st.success("Ready to chat!")

	    user_question = st.text_input("Ask a question based on the PDF:")
	    if user_question:
	        with st.spinner("Generating answer..."):
	            result = qa_chain.run(user_question)
	        st.markdown("Answer:")
	        st.write(result)

	# Script entry point: call main() only when this file is executed directly
	# (``__name__`` is "__main__"), not when it is imported as a module.
	if __name__ == "__main__":
	main()