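"""Streamlit RAG chatbot for PDF question answering.

Extracts text from an uploaded PDF with PyMuPDF, chunks and embeds it into an
in-memory Chroma vector store, and answers questions with MBZUAI/LaMini-T5-738M
through a LangChain RetrievalQA chain.

Run locally with `streamlit run app.py` (assuming this file is saved as app.py).
"""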
import os
import streamlit as st
import fitz # PyMuPDF
import logging
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader
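# Dependencies (assumed from the imports above): streamlit, pymupdf, transformers,
# torch, langchain, langchain-community, chromadb, sentence-transformers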
# --- Configuration ---
st.set_page_config(page_title="📘 RAG PDF Chatbot", layout="wide")
st.title("📘 RAG-based PDF Chatbot")
device = "cpu"
# --- Logging ---
logging.basicConfig(level=logging.INFO)
# --- Load LLM ---
@st.cache_resource
def load_model():
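    """Load LaMini-T5 and wrap it in a LangChain-compatible pipeline.

    Decorated with @st.cache_resource so the model is loaded once per process,
    not on every Streamlit rerun.
    """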
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device,
                    max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
    return HuggingFacePipeline(pipeline=pipe)
# --- Extract PDF Text ---
def read_pdf(file):
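    """Extract plain text from an uploaded PDF, returning "" on failure."""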
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text.strip()
    except Exception as e:
        logging.error(f"Failed to extract text: {e}")
        return ""
# --- Process Answer ---
def process_answer(question, full_text):
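    """Answer a question about the PDF text with a retrieval-augmented pipeline.

    Steps: write the text to a temp file -> chunk -> embed -> index in Chroma ->
    retrieve relevant chunks -> generate an answer with the LLM.
    """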
    # Persist the extracted text so LangChain's TextLoader can read it back
    with open("temp_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)
    loader = TextLoader("temp_text.txt", encoding="utf-8")
    docs = loader.load()
    os.remove("temp_text.txt")  # the temp file is no longer needed once loaded
    # Chunk the document; the overlap preserves context across chunk boundaries
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
    splits = text_splitter.split_documents(docs)
    # Embed the chunks
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Create an in-memory Chroma vector store and expose it as a retriever
    db = Chroma.from_documents(splits, embedding=embeddings)
    retriever = db.as_retriever()
    # Load the cached LLM
    llm = load_model()
    # Custom prompt that grounds the answer in the retrieved context
    prompt_template = PromptTemplate.from_template("""
You are a helpful assistant. Use the following context to answer the question as accurately and thoroughly as possible.
Context: {context}
Question: {question}
Answer in detail:""")
    # "stuff" chain type: all retrieved chunks go into a single prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template}
    )
    return qa_chain.run(question)
# --- UI Layout ---
with st.sidebar:
    st.header("📄 Upload PDF")
    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
# --- Main Interface ---
if uploaded_file:
    st.success(f"You uploaded: {uploaded_file.name}")
    full_text = read_pdf(uploaded_file)
    if full_text:
        st.subheader("📄 PDF Preview")
        with st.expander("View Extracted Text"):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
        st.subheader("💬 Ask a Question")
        user_question = st.text_input("Type your question about the PDF content")
        if user_question:
            with st.spinner("Thinking..."):
                answer = process_answer(user_question, full_text)
            st.markdown("### 🤖 Answer")
            st.write(answer)
        with st.sidebar:
            st.markdown("---")
            with st.expander("💡 Suggestions", expanded=True):
                st.markdown("""
                - "Summarize this document"
                - "Give a quick summary"
                - "What are the main points?"
                - "Explain this document in short"
                """)
    else:
        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
else:
    st.info("Upload a PDF to begin.")