Update app.py

app.py CHANGED
@@ -420,9 +420,9 @@
 import os
 import streamlit as st
 import fitz  # PyMuPDF
-import logging
 import tempfile
 import shutil
+import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -430,132 +430,125 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
 from langchain_community.document_loaders import TextLoader
+from langchain.docstore.document import Document
 
-# ---
+# --- Streamlit Config ---
 st.set_page_config(page_title="📄 RAG PDF Chatbot", layout="wide")
 st.title("📄 RAG-based PDF Chatbot")
-device = "cpu"
 
 # --- Logging ---
 logging.basicConfig(level=logging.INFO)
 
-# --- Load LLM ---
+# --- Load LLM Model ---
 @st.cache_resource
-def
+def load_llm():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
     pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
     return HuggingFacePipeline(pipeline=pipe)
 
-# ---
-def
+# --- PDF Text Extraction ---
+def extract_text_from_pdf(file):
     try:
         doc = fitz.open(stream=file.read(), filetype="pdf")
-
-        # Extract text from each page
+        full_text = ""
         for page in doc:
-
-        return
+            full_text += page.get_text()
+        return full_text.strip()
     except Exception as e:
-        logging.error(f"
+        logging.error(f"Error reading PDF: {e}")
        return ""
 
-# --- Build
-
-
-
-
-
-
-    loader = TextLoader("temp_text.txt")
-    docs = loader.load()
-
-    # Chunking
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-    splits = text_splitter.split_documents(docs)
-
-    # Embeddings
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-    # Safe temporary directory for Chroma
-    chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db_rag")
-    if os.path.exists(chroma_dir):
-        shutil.rmtree(chroma_dir)
-    os.makedirs(chroma_dir, exist_ok=True)
+# --- Build Vectorstore ---
+def create_vectorstore(text_chunks, embeddings):
+    temp_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    os.makedirs(temp_dir, exist_ok=True)
 
-
+    # Wrap each chunk in a Document object
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
     db.persist()
-    return db
-
-# ---
-def
-
+    return db
+
+# --- Smart Chunking ---
+def chunk_text(full_text):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=150,
+        separators=["\n\n", "\n", ".", "!", "?", " ", ""]
+    )
+    return splitter.split_text(full_text)
+
+# --- Answering Logic ---
+def process_question(question, full_text):
+    if not full_text:
+        return "No valid text extracted from PDF."
+
+    text_chunks = chunk_text(full_text)
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    vectorstore = create_vectorstore(text_chunks, embeddings)
+    retriever = vectorstore.as_retriever()
 
-
-
-
-
-
+    llm = load_llm()
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        chain_type="stuff",
+        return_source_documents=False,
+        chain_type_kwargs={
+            "prompt": f"""You are a helpful assistant. Answer the user's question based only on the provided document content.
 
-
-    # Let's modify how we ask the model to answer
-    prompt = f"""
-    Given the following text, answer the question with a simple and direct 'Yes' or 'No' followed by a brief explanation.
+If the answer is clearly stated in the document, respond accurately and directly.
 
-
-
-    Question: {question}
-    Answer:
-    """
+If not, say "The document does not provide enough information." Do not make things up.
 
-
-
-
+Question: {question}
+Context: {{context}}
+Answer:"""
+        }
+    )
 
-    return
+    return qa.run(question)
 
-# --- UI
+# --- Streamlit UI ---
 with st.sidebar:
     st.header("📁 Upload PDF")
-    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
 
-# --- Main Interface ---
 if uploaded_file:
-    st.success(f"
-    full_text =
+    st.success(f"Uploaded: {uploaded_file.name}")
+    full_text = extract_text_from_pdf(uploaded_file)
 
     if full_text:
         st.subheader("📄 PDF Preview")
-        with st.expander("View Extracted Text"):
+        with st.expander("📄 View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
 
-        st.subheader("💬 Ask
-        user_question = st.text_input("
-
-        # Build retriever once per session
-        retriever = build_retriever(full_text)
+        st.subheader("💬 Ask your question")
+        user_question = st.text_input("Enter your question about the PDF")
 
         if user_question:
-            with st.spinner("
-            answer =
+            with st.spinner("🤖 Generating Answer..."):
+                answer = process_question(user_question, full_text)
             st.markdown("### 🤖 Answer")
             st.write(answer)
 
         with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
-            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-            with st.expander("💡 Suggestions", expanded=True):
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
             st.markdown("""
            - "Summarize this document"
-            - "
-            - "What
-            - "
+            - "What is the background of Pradeep Singh Sengar?"
+            - "What experience does he have?"
+            - "List key skills mentioned in the document."
            """)
-
     else:
-        st.error("
+        st.error("❌ No extractable text found in this PDF. Try another file.")
 else:
     st.info("Upload a PDF to begin.")
 
+
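A few notes on the change. The rewritten extraction helper opens the upload through PyMuPDF's stream mode, which is the right call here: Streamlit's file_uploader hands back an in-memory buffer rather than a filesystem path. A minimal sketch of the same extraction path, runnable outside Streamlit ("sample.pdf" is a placeholder name, not a file from this Space):

    import fitz  # PyMuPDF

    # Same logic as extract_text_from_pdf(), fed from disk for a quick test.
    with open("sample.pdf", "rb") as f:
        doc = fitz.open(stream=f.read(), filetype="pdf")

    full_text = ""
    for page in doc:
        full_text += page.get_text()  # plain-text layer of each page

    print(f"{doc.page_count} pages, {len(full_text)} characters extracted")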
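The chunking change does two things: it drops the old temp-file round trip (write temp_text.txt, reload it with TextLoader, split_documents) in favor of an in-memory split_text call, and it retunes the splitter from chunk_size=800 / overlap=300 to 1000 / 150 with an explicit separator ladder, so splits prefer paragraph breaks, then line breaks, then sentence punctuation, before falling back to spaces. One side effect: the TextLoader import is now unused. The behavior is easy to inspect in isolation (the toy text below is purely illustrative):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        separators=["\n\n", "\n", ".", "!", "?", " ", ""],
    )

    # Two long paragraphs: the first split should land on the paragraph break.
    text = ("Alpha sentence. " * 60) + "\n\n" + ("Beta sentence. " * 60)
    for i, chunk in enumerate(splitter.split_text(text)):
        print(i, len(chunk), repr(chunk[:40]))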
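One thing worth flagging in process_question: RetrievalQA's "stuff" chain expects chain_type_kwargs["prompt"] to be a PromptTemplate, not a bare string, and the f-string here bakes {question} in at build time while leaving {{context}} as a literal the chain never fills. Assuming the intent was for the chain to inject both variables at query time, the conventional wiring looks like this (same wording as the commit's prompt):

    from langchain.prompts import PromptTemplate

    template = """You are a helpful assistant. Answer the user's question based only on the provided document content.

    If the answer is clearly stated in the document, respond accurately and directly.

    If not, say "The document does not provide enough information." Do not make things up.

    Question: {question}
    Context: {context}
    Answer:"""

    qa_prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    # Then, inside process_question:
    # qa = RetrievalQA.from_chain_type(..., chain_type_kwargs={"prompt": qa_prompt})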
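Also worth noting: process_question re-chunks the text, re-embeds every chunk, and wipes and rebuilds the Chroma index on every question, whereas the removed "# Build retriever once per session" comment suggests the previous version built the retriever once. If latency matters, a cached variant along these lines would restore that behavior; build_retriever_cached is a hypothetical helper, not part of this commit:

    import streamlit as st

    @st.cache_resource(show_spinner=False)
    def build_retriever_cached(full_text: str):
        # Hypothetical: reuses the app's own chunk_text / create_vectorstore so
        # embedding and indexing happen once per document, not once per question.
        chunks = chunk_text(full_text)
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        return create_vectorstore(chunks, embeddings).as_retriever()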
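Finally, @st.cache_resource on load_llm means the 738M-parameter LaMini-T5 checkpoint is downloaded and instantiated once per server process and reused across script reruns. The wrapper can be smoke-tested on its own, independent of retrieval (the .invoke call assumes a recent LangChain):

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
    from langchain_community.llms import HuggingFacePipeline

    # Same construction as load_llm(), outside Streamlit.
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    llm = HuggingFacePipeline(pipeline=pipe)

    print(llm.invoke("What is retrieval-augmented generation?"))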