Update app.py
app.py CHANGED
@@ -483,50 +483,103 @@ def setup_qa(db):
     return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
 
 # --- Process Answer ---
-def process_answer(question, full_text):
-    # STEP 1: Chunk the PDF text
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-    docs = text_splitter.create_documents([full_text])
-
-    # STEP 2: Create embeddings
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    db = Chroma.from_documents(docs, embeddings)
-
-    # STEP 3: Retrieve relevant chunks using the question
-    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
-    relevant_docs = retriever.get_relevant_documents(question)
-
-    # STEP 4: Format the context
-    context = "\n\n".join([doc.page_content for doc in relevant_docs])
-
-    # STEP 5: Prompting
-    prompt_template = """
-    You are a helpful assistant that answers questions based on the context below.
-
-    Context:
-    {context}
-
-    Question: {question}
-
-    Answer:
-    """.strip()
-
-    prompt = prompt_template.format(context=context, question=question)
-
-    # STEP 6: Load the model and generate response
-    llm = HuggingFacePipeline.from_model_id(
-        model_id="MBZUAI/LaMini-T5-738M",
-        task="text2text-generation",
-        model_kwargs={"temperature": 0.3, "max_length": 256},
-    )
-
-    return llm.invoke(prompt)
+# def process_answer(question, full_text):
+#     # STEP 1: Chunk the PDF text
+#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+#     docs = text_splitter.create_documents([full_text])
+
+#     # STEP 2: Create embeddings
+#     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+#     db = Chroma.from_documents(docs, embeddings)
+
+#     # STEP 3: Retrieve relevant chunks using the question
+#     retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+#     relevant_docs = retriever.get_relevant_documents(question)
+
+#     # STEP 4: Format the context
+#     context = "\n\n".join([doc.page_content for doc in relevant_docs])
+
+#     # STEP 5: Prompting
+#     prompt_template = """
+#     You are a helpful assistant that answers questions based on the context below.
+
+#     Context:
+#     {context}
+
+#     Question: {question}
+
+#     Answer:
+#     """.strip()
+
+#     prompt = prompt_template.format(context=context, question=question)
+
+#     # STEP 6: Load the model and generate response
+#     llm = HuggingFacePipeline.from_model_id(
+#         model_id="MBZUAI/LaMini-T5-738M",
+#         task="text2text-generation",
+#         model_kwargs={"temperature": 0.3, "max_length": 256},
+#     )
+
+#     return llm.invoke(prompt)
+
+def process_answer(question, full_text):
+    from langchain_community.document_loaders import TextLoader
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain.vectorstores import Chroma
+    from langchain_community.embeddings import SentenceTransformerEmbeddings
+    from langchain.chains import RetrievalQA
+    from langchain import HuggingFacePipeline
+    from transformers import pipeline
+    import os
+    import shutil
+
+    # Save to temp file and load it as document
+    with open("temp_text.txt", "w") as f:
+        f.write(full_text)
+
+    loader = TextLoader("temp_text.txt")
+    docs = loader.load()
+
+    # Chunking the docs
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    splits = text_splitter.split_documents(docs)
+
+    # Embeddings
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+    # Clean up old DB if exists
+    if os.path.exists("chroma_db"):
+        shutil.rmtree("chroma_db")
+
+    db = Chroma.from_documents(splits, embeddings, persist_directory="chroma_db")
+    retriever = db.as_retriever()
+
+    # Model pipeline
+    pipe = pipeline("text2text-generation", model="MBZUAI/LaMini-T5-738M", max_length=512)
+    llm = HuggingFacePipeline(pipeline=pipe)
+
+    # Retrieval QA chain
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        return_source_documents=False
+    )
+
+    # Check if question is about summarization
+    if "summarize" in question.lower() or "summary" in question.lower() or "tl;dr" in question.lower():
+        prompt = f"Summarize the following document:\n\n{full_text[:3000]}"  # trimming to 3K chars for model
+        summary = llm(prompt)
+        return summary
+    else:
+        answer = qa_chain.run(question)
+        return answer
 
 
 # --- UI Layout ---
 with st.sidebar:
     st.header("📄 Upload PDF")
     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+
 
 # --- Main Interface ---
 if uploaded_file:
@@ -551,6 +604,14 @@ if uploaded_file:
     st.markdown("---")
     st.markdown("**💡 Suggestions:**")
     st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+    with st.expander("💡 Suggestions", expanded=True):
+        st.markdown("""
+        - "Summarize this document"
+        - "Give a quick summary"
+        - "What are the main points?"
+        - "Explain this document in short"
+        """)
+
 
 else:
     st.error("⚠️ No text could be extracted from the PDF. Try another file.")
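
The diff leaves the body of the if uploaded_file: block (new lines 586-603) unchanged and therefore elided. For orientation, here is a minimal sketch of how process_answer is typically wired into a Streamlit interface of this shape; the extract_text_from_pdf helper and the widget labels are illustrative assumptions, not part of this commit.

# Hypothetical sketch of the elided main-interface block; not part of this commit.
# extract_text_from_pdf is an assumed helper defined elsewhere in app.py.
if uploaded_file:
    full_text = extract_text_from_pdf(uploaded_file)
    if full_text:
        question = st.text_input("Ask a question about the PDF")
        if question:
            with st.spinner("Thinking..."):
                answer = process_answer(question, full_text)
            st.write(answer)

The commit itself shifts process_answer from hand-formatting a prompt and calling llm.invoke to persisting the Chroma index on disk, answering ordinary questions through a RetrievalQA chain, and routing summary-style questions directly to the LLM with a truncated (3,000-character) slice of the raw text.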