Update app.py

app.py
CHANGED
@@ -422,21 +422,19 @@ import os
 import streamlit as st
 import fitz  # PyMuPDF
 import logging
-import
+import tempfile
+import shutil
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
-from
-from sentence_transformers import SentenceTransformer
-from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import TextLoader

 # --- Configuration ---
 st.set_page_config(page_title="📄 RAG PDF Chatbot", layout="wide")
 st.title("📄 RAG-based PDF Chatbot")
-persist_directory = "db"
 device = "cpu"

 # --- Logging ---
@@ -463,94 +461,8 @@ def read_pdf(file):
         logging.error(f"Failed to extract text: {e}")
         return ""

-# --- Split Text into Chunks ---
-def split_text_into_chunks(text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    return splitter.create_documents([text])
-
-import os
-import shutil
-from sentence_transformers import SentenceTransformer
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Chroma
-
-# Setup a writable directory for Chroma
-chroma_dir = "/home/user/app/chroma_db"  # Change this to an absolute writable directory
-if os.path.exists(chroma_dir):
-    shutil.rmtree(chroma_dir)  # Clear any old data
-os.makedirs(chroma_dir, exist_ok=True)
-
-# Initialize the model and embeddings
-model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')
-embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-
-# Create the Chroma database
-try:
-    db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
-    db.persist()
-    print(f"Vectorstore created successfully at {chroma_dir}")
-except Exception as e:
-    print(f"Error creating vectorstore: {e}")
-
-
-# --- Setup QA Chain ---
-def setup_qa(db):
-    retriever = db.as_retriever()
-    llm = load_model()
-    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
-
 # --- Process Answer ---
-# def process_answer(question, full_text):
-#     # STEP 1: Chunk the PDF text
-#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-#     docs = text_splitter.create_documents([full_text])
-
-#     # STEP 2: Create embeddings
-#     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-#     db = Chroma.from_documents(docs, embeddings)
-
-#     # STEP 3: Retrieve relevant chunks using the question
-#     retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
-#     relevant_docs = retriever.get_relevant_documents(question)
-
-#     # STEP 4: Format the context
-#     context = "\n\n".join([doc.page_content for doc in relevant_docs])
-
-#     # STEP 5: Prompting
-#     prompt_template = """
-#     You are a helpful assistant that answers questions based on the context below.
-
-#     Context:
-#     {context}
-
-#     Question: {question}
-
-#     Answer:
-#     """.strip()
-
-#     prompt = prompt_template.format(context=context, question=question)
-
-#     # STEP 6: Load the model and generate response
-#     llm = HuggingFacePipeline.from_model_id(
-#         model_id="MBZUAI/LaMini-T5-738M",
-#         task="text2text-generation",
-#         model_kwargs={"temperature": 0.3, "max_length": 256},
-#     )
-
-#     return llm.invoke(prompt)
-
-import tempfile
-import os
-
 def process_answer(question, full_text):
-    from langchain_community.document_loaders import TextLoader
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
-    from langchain.vectorstores import Chroma
-    from langchain_community.embeddings import SentenceTransformerEmbeddings
-    from langchain.chains import RetrievalQA
-    from langchain import HuggingFacePipeline
-    from transformers import pipeline
-
     # Save the full_text to a temporary file
     with open("temp_text.txt", "w") as f:
         f.write(full_text)
@@ -568,15 +480,13 @@ def process_answer(question, full_text):
     # Create a temporary directory for ChromaDB
     chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
     if os.path.exists(chroma_dir):
-        import shutil
         shutil.rmtree(chroma_dir)

     db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
     retriever = db.as_retriever()

     # Set up the model
-
-    llm = HuggingFacePipeline(pipeline=pipe)
+    llm = load_model()

     # RAG-style retrieval QA
     qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
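Note: `load_model()` is called in the hunk above but defined outside the changed lines, so its body does not appear in this diff. Purely as a hedged illustration, a minimal sketch of what such a helper could look like, assuming it wraps the same MBZUAI/LaMini-T5-738M checkpoint and generation settings as the commented-out code this commit deletes (every name below other than the imports is hypothetical):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline

def load_model():
    # Hypothetical reconstruction; the real load_model() in app.py may differ.
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
    )
    # Wrap the transformers pipeline so LangChain's RetrievalQA can drive it as an LLM
    return HuggingFacePipeline(pipeline=pipe)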
@@ -589,12 +499,10 @@ def process_answer(question, full_text):
     else:
         return qa_chain.run(question)

-
 # --- UI Layout ---
 with st.sidebar:
     st.header("📄 Upload PDF")
     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-

 # --- Main Interface ---
 if uploaded_file:
@@ -602,7 +510,7 @@ if uploaded_file:
     full_text = read_pdf(uploaded_file)

     if full_text:
-        st.subheader("
+        st.subheader("📄 PDF Preview")
         with st.expander("View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

@@ -618,7 +526,7 @@ if uploaded_file:
         with st.sidebar:
             st.markdown("---")
             st.markdown("**💡 Suggestions:**")
-            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
             with st.expander("💡 Suggestions", expanded=True):
                 st.markdown("""
                 - "Summarize this document"
@@ -627,7 +535,6 @@ if uploaded_file:
                 - "Explain this document in short"
                 """)

-
     else:
         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
 else:
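The lines between the second and third hunks (new lines 469-479 of app.py) are unchanged and therefore not shown, yet that is where `splits` and `embeddings` must be built before `Chroma.from_documents(splits, embeddings, ...)` runs. A rough sketch of that hidden stage, assuming it reuses the TextLoader import added at the top of the file, the chunking parameters of the deleted split_text_into_chunks helper, and the all-MiniLM-L6-v2 model used elsewhere; all variable names except `splits` and `embeddings` are hypothetical:

    # Hypothetical reconstruction of the unchanged region inside process_answer()
    # Load the text that was just written to temp_text.txt
    loader = TextLoader("temp_text.txt")
    documents = loader.load()

    # Chunk the document for retrieval (parameters from the deleted helper)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(documents)

    # Embed the chunks with the same MiniLM model used elsewhere in the file
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

Rebuilding the Chroma index under tempfile.gettempdir() keeps the vector store in a writable location, which appears to be what the removed module-level /home/user/app/chroma_db block was attempting.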