import os
import streamlit as st
import fitz  # PyMuPDF
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader

# --- Configuration ---
st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
st.title("📚 RAG-based PDF Chatbot")
device = "cpu"

# --- Logging ---
logging.basicConfig(level=logging.INFO)

# --- Load LLM ---
@st.cache_resource
def load_model():
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline(
        'text2text-generation',
        model=model,
        tokenizer=tokenizer,
        max_length=1024,
        do_sample=True,
        temperature=0.3,
        top_k=50,
        top_p=0.95,
    )
    return HuggingFacePipeline(pipeline=pipe)

# --- Extract PDF Text ---
def read_pdf(file):
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Failed to extract text: {e}")
        return ""

# --- Process Answer ---
def process_answer(question, full_text):
    # Save the full text to a temporary file (UTF-8 so non-ASCII PDF text doesn't break the write)
    with open("temp_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)

    loader = TextLoader("temp_text.txt", encoding="utf-8")
    docs = loader.load()

    # Chunk the documents with increased size and overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
    splits = text_splitter.split_documents(docs)

    # Load embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    # Create Chroma in-memory vector store
    db = Chroma.from_documents(splits, embedding=embeddings)
    retriever = db.as_retriever()

    # Set up the model
    llm = load_model()

    # Create a custom prompt
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

Context:
{context}

Question: {question}

Important Instructions:
- If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
- Do NOT summarize or paraphrase.
- If the information is not in the context, say "Not found in the document."

Answer:
""",
    )

    # Retrieval QA with custom prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template},
    )

    # Return the answer using the retrieval QA chain
    return qa_chain.run(question)
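# A minimal sketch for exercising process_answer outside the Streamlit UI (editor's example,
# kept commented out; "sample.txt" and the question are placeholders, not part of the app):
#
#   with open("sample.txt", encoding="utf-8") as f:
#       print(process_answer("What is the key idea?", f.read()))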
# --- UI Layout ---
with st.sidebar:
    st.header("📄 Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

# --- Main Interface ---
if uploaded_file:
    st.success(f"You uploaded: {uploaded_file.name}")
    full_text = read_pdf(uploaded_file)

    if full_text:
        st.subheader("📁 PDF Preview")
        with st.expander("View Extracted Text"):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("💬 Ask a Question")
        user_question = st.text_input("Type your question about the PDF content")

        if user_question:
            with st.spinner("Thinking..."):
                answer = process_answer(user_question, full_text)
                st.markdown("### 🤖 Answer")
                st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
            with st.expander("💡 Suggestions", expanded=True):
                st.markdown("""
                - "Summarize this document"
                - "Give a quick summary"
                - "What are the main points?"
                - "Explain this document in short"
                """)
    else:
        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
else:
    st.info("Upload a PDF to begin.")

# --- Alternative implementation (FAISS + HuggingFaceHub), kept commented out for reference ---
# import os
# import streamlit as st
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain.llms import HuggingFaceHub
#
# # Set your Hugging Face API token here
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_hf_token_here"
#
# # Load and split PDF
# def load_and_split_pdf(uploaded_file):
#     with open("temp.pdf", "wb") as f:
#         f.write(uploaded_file.read())
#     loader = PyPDFLoader("temp.pdf")
#     documents = loader.load()
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
#     chunks = text_splitter.split_documents(documents)
#     return chunks
#
# # Build vectorstore
# def build_vectorstore(chunks):
#     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     vectorstore = FAISS.from_documents(chunks, embedding=embedding_model)
#     return vectorstore
#
# # Load Lamini or other HF model
# def get_llm():
#     return HuggingFaceHub(
#         repo_id="lamini/lamini-13b-chat",
#         model_kwargs={"temperature": 0.2, "max_new_tokens": 512}
#     )
#
# # Create prompt template (optional for better accuracy)
# custom_prompt = PromptTemplate(
#     input_variables=["context", "question"],
#     template="""
# You are a helpful assistant. Use the following context to answer the question as accurately as possible.
# If the answer is not in the context, respond with "Not found in the document."
#
# Context:
# {context}
#
# Question: {question}
#
# Answer:"""
# )
#
# # Build QA chain
# def build_qa_chain(vectorstore):
#     llm = get_llm()
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
#         chain_type_kwargs={"prompt": custom_prompt}
#     )
#     return qa_chain
#
# # Streamlit UI
# def main():
#     st.set_page_config(page_title="PDF Chatbot", layout="wide")
#     st.title("Chat with your PDF")
#
#     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
#
#     if uploaded_file:
#         st.success("PDF uploaded successfully!")
#         with st.spinner("Processing PDF..."):
#             chunks = load_and_split_pdf(uploaded_file)
#             vectorstore = build_vectorstore(chunks)
#             qa_chain = build_qa_chain(vectorstore)
#         st.success("Ready to chat!")
#
#         user_question = st.text_input("Ask a question based on the PDF:")
#         if user_question:
#             with st.spinner("Generating answer..."):
#                 result = qa_chain.run(user_question)
#                 st.markdown("**Answer:**")
#                 st.write(result)
#
# if __name__ == "__main__":
#     main()
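# --- Usage note for the active app above (editor's sketch; exact package set and versions
# may vary with your environment, and "app.py" is a placeholder for this file's name) ---
# The imports suggest dependencies along these lines:
#
#   pip install streamlit pymupdf transformers torch sentencepiece \
#       langchain langchain-community chromadb sentence-transformers
#   streamlit run app.py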