# Hugging Face Space: zliang/PDFReadingAssistant (status: Paused)
# File size: 15,307 Bytes

import os
import time
import io
import base64
import re
import numpy as np
import fitz  # PyMuPDF
import tempfile
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO
import streamlit as st
from streamlit_chat import message
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import SpacyTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from streamlit.runtime.scriptrunner import get_script_run_ctx
from streamlit import runtime

# Initialize models and environment
def _ensure_spacy_model(model_name="en_core_web_sm"):
    """Download the spaCy pipeline *model_name* unless it is already importable.

    Streamlit re-executes this script on every user interaction, so an
    unconditional download would be attempted on each rerun. The download is
    best-effort: a failure is not raised here.
    """
    import importlib.util
    import subprocess
    import sys

    if importlib.util.find_spec(model_name) is not None:
        return
    # Use the running interpreter (not whatever "python" is on PATH) and an
    # argument list instead of a shell string.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", model_name],
        check=False,
    )
    # Make the freshly installed package visible to this process.
    importlib.invalidate_caches()


_ensure_spacy_model()
# NOTE(review): the detector is still loaded on every script rerun; consider
# caching it once thread-safety of a shared model instance is confirmed.
model = YOLO("best.pt")
openai_api_key = os.environ.get("openai_api_key")
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB

# Utility functions
@st.cache_data(show_spinner=False, ttl=3600)
def clean_text(text):
    """Collapse each run of whitespace in *text* to a single space and trim both ends."""
    collapsed = re.compile(r'\s+').sub(' ', text)
    return collapsed.strip()

# Matches a line that consists only of a bibliography heading, optionally
# numbered ("7. References", "VII. REFERENCES") and optionally ending in ":".
_REFERENCE_HEADING_RE = re.compile(
    r'^\s*(?:(?:\d+|[ivxlc]+)[.)]?\s+)?'
    r'(?:references?|bibliography|citations|works\s+cited)\s*:?\s*$',
    re.IGNORECASE,
)


def remove_references(text):
    """Return *text* truncated just before its bibliography heading.

    The text is scanned line by line; everything from the first line that is
    a standalone heading ("References", "Bibliography", "Citations",
    "Works Cited", "Reference"; case-insensitive) onwards is dropped.

    Only whole-line headings count. A sentence that merely contains one of
    these words (e.g. "with reference to earlier work") no longer cuts the
    document short.

    Args:
        text: Document text with lines separated by "\\n".

    Returns:
        The text before the heading, or *text* unchanged when no heading
        line is found.
    """
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if _REFERENCE_HEADING_RE.match(line):
            return '\n'.join(lines[:i])
    return text

def handle_errors(func):
    """Decorator that reports exceptions from *func* in the chat instead of raising.

    On success the wrapped function's return value is passed through. On any
    ``Exception`` an error entry is appended to
    ``st.session_state.chat_history`` and ``st.rerun()`` is called so the
    message is rendered.

    The wrapper takes on the wrapped function's ``__module__``, ``__name__``,
    ``__qualname__`` and ``__doc__`` so that outer decorators and tracebacks
    can tell decorated functions apart.

    ``__wrapped__`` is intentionally NOT set (so ``functools.wraps`` is not
    used): signature introspection would then see parameter names such as
    ``_pdf_file_path``, and ``st.cache_data`` excludes underscore-prefixed
    parameters from its cache key.
    NOTE(review): confirm how the installed Streamlit version derives its
    per-function cache namespace before changing this.
    """
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            st.session_state.chat_history.append({
                "bot": f"❌ An error occurred: {str(e)}"
            })
            st.rerun()

    for attr in ("__module__", "__name__", "__qualname__", "__doc__"):
        try:
            setattr(wrapper, attr, getattr(func, attr))
        except AttributeError:
            # Callables without these attributes keep the wrapper defaults.
            pass
    return wrapper

def show_progress(message):
    """Animate a progress bar from 1% to 100% with a status line, then clear both.

    The animation is purely time-driven (100 steps of 0.02 s each); it does
    not track any real work.

    Args:
        message: Label shown in front of the percentage.
    """
    bar = st.progress(0)
    label = st.empty()
    for percent in range(1, 101):
        time.sleep(0.02)
        bar.progress(percent)
        label.text(f"{message}... {percent}%")
    # Remove both placeholders so nothing lingers on the page.
    for placeholder in (bar, label):
        placeholder.empty()

def scroll_to_bottom():
    """Inject a script that scrolls the main section to its bottom.

    Does nothing unless a script-run context is present and the Streamlit
    runtime exists. The script is embedded through a zero-height HTML
    component and fires after a 100 ms timeout.
    """
    if not get_script_run_ctx() or not runtime.exists():
        return
    scroll_script = """
        <script>
            function scrollToBottom() {
                window.parent.document.querySelector('section.main').scrollTo(0, window.parent.document.querySelector('section.main').scrollHeight);
            }
            setTimeout(scrollToBottom, 100);
        </script>
        """
    st.components.v1.html(scroll_script, height=0)

# ----------------------------
# Core Processing Functions
# ----------------------------
# Prompt for the cited summary: excerpts are numbered and the model is asked
# to reproduce the full excerpt text in the reference list.
_CITED_SUMMARY_TEMPLATE = """Generate a comprehensive summary with the following elements:
1. Key findings and conclusions
2. Main methodologies used
3. Important data points
4. Limitations mentioned

For any information that is directly derived from the provided context excerpts, insert an in-text citation in the format [n] where n corresponds to the excerpt number.

After the summary, please provide a reference list where each citation number is mapped to the full original text excerpt as provided below. Do not simply echo the citation number; include the complete excerpt text.

Context Excerpts:
{contexts}"""

# Prompt for the plain summary: same structure, no citation markers.
_PLAIN_SUMMARY_TEMPLATE = """Generate a comprehensive summary with the following elements:
1. Key findings and conclusions
2. Main methodologies used
3. Important data points
4. Limitations mentioned

Base the summary only on the context excerpts provided below. Do not include citation markers or a reference list.

Context Excerpts:
{contexts}"""


def _select_representative_chunks(pdf_file_path, num_clusters):
    """Return up to *num_clusters* text chunks that represent the PDF.

    The PDF text is loaded, stripped of its reference section, whitespace
    normalised and split into chunks. The chunks are embedded and clustered
    with KMeans, and the chunk nearest to each cluster centre is selected.

    Selected chunks are de-duplicated (two centres can share a nearest chunk)
    and returned in document order.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    docs = PyMuPDFLoader(pdf_file_path).load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    split_contents = SpacyTextSplitter(chunk_size=500).split_text(cleaned_full_text)
    if not split_contents:
        raise ValueError("No extractable text found in the PDF.")

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    embeddings = np.asarray(embeddings_model.embed_documents(split_contents))

    # KMeans rejects n_clusters > n_samples, which short documents would hit.
    n_clusters = min(num_clusters, len(split_contents))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)

    nearest = {
        int(np.argmin(np.linalg.norm(embeddings - center, axis=1)))
        for center in kmeans.cluster_centers_
    }
    return [split_contents[idx] for idx in sorted(nearest)]


def _run_summary_chain(template, contexts):
    """Fill *template* with *contexts*, send it to the chat model, return the text."""
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
    chain = ChatPromptTemplate.from_template(template) | llm | StrOutputParser()
    return chain.invoke({"contexts": contexts})


@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
    """
    Generates a summary that includes in-text citations based on selected context chunks.
    Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
    After the summary, a reference list is provided mapping each citation number to the
    full original text excerpt.

    Args:
        _pdf_file_path: Path of the PDF file to summarise.
        num_clusters: Upper bound on the number of excerpts given to the model;
            fewer are used when the document has fewer chunks.

    Returns:
        The model's summary text (summary followed by the reference list).
    """
    chunks = _select_representative_chunks(_pdf_file_path, num_clusters)
    combined_contexts = "\n\n".join(
        f"[{number}]: {chunk}" for number, chunk in enumerate(chunks, start=1)
    )
    return _run_summary_chain(_CITED_SUMMARY_TEMPLATE, combined_contexts)


@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf(_pdf_file_path, num_clusters=10):
    """
    Generates a summary without citation markers or a reference list.

    Uses the same excerpt selection as summarize_pdf_with_citations, but the
    excerpts are passed to the model unnumbered.

    Args:
        _pdf_file_path: Path of the PDF file to summarise.
        num_clusters: Upper bound on the number of excerpts given to the model.

    Returns:
        The model's summary text.
    """
    chunks = _select_representative_chunks(_pdf_file_path, num_clusters)
    return _run_summary_chain(_PLAIN_SUMMARY_TEMPLATE, "\n\n".join(chunks))


@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    """Answer *query* using the PDF chunks most similar to it.

    The PDF text is loaded, stripped of its reference section, whitespace
    normalised and split into chunks. The chunks and the query are embedded,
    and the ``num_clusters`` chunks with the highest cosine similarity to
    the query are passed to the chat model as the only context.

    Args:
        _pdf_file_path: Path of the PDF file to query.
        query: The user's question.
        num_clusters: Number of top-matching chunks to use as context
            (despite the name, no clustering happens here).

    Returns:
        The model's formatted answer text.

    Raises:
        ValueError: If ``num_clusters`` is below 1 or the PDF yields no text.
    """
    if num_clusters < 1:
        # A zero would turn the slice below into [-0:], i.e. every chunk.
        raise ValueError("num_clusters must be at least 1")

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Answer this question: {question}
Using only this context: {context}
Format your answer with:
- Clear section headings
- Bullet points for lists
- **Bold** key terms
- Citations from the text"""
    )

    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)
    if not split_contents:
        # Without this, the similarity computation fails on an empty matrix
        # with a message that says nothing about the PDF.
        raise ValueError("No extractable text found in the PDF.")

    query_embedding = embeddings_model.embed_query(query)
    chunk_embeddings = embeddings_model.embed_documents(split_contents)
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # argsort is ascending, so the tail holds the best matches.
    top_indices = np.argsort(similarities)[-num_clusters:]

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({
        "question": query,
        "context": ' '.join([split_contents[i] for i in top_indices])
    })


@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def process_pdf(_pdf_file_path):
    """Detect figures and tables on every page and return them as image crops.

    Each page is rendered at 50 dpi for detection. Pages with at least one
    accepted detection are rendered again at 300 dpi, and each box is cut
    from that high-resolution render.

    Args:
        _pdf_file_path: Path of the PDF file to scan.

    Returns:
        A tuple ``(figures, tables)`` of lists of ``uint8`` arrays cropped
        from the 300-dpi render (height x width x 3).
    """
    # Detector class ids kept by this function: 4 goes to figures, 3 to tables.
    table_cls, figure_cls = 3, 4
    min_confidence = 0.8
    detect_dpi, crop_dpi = 50, 300
    all_figures, all_tables = [], []

    # Context manager guarantees the document is closed, even on errors.
    with fitz.open(_pdf_file_path) as doc:
        for page in doc:
            low_res = page.get_pixmap(dpi=detect_dpi)
            low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(
                low_res.height, low_res.width, 3
            )

            # NOTE(review): Ultralytics documents numpy inputs as BGR, while
            # this buffer comes straight from the pixmap; confirm whether the
            # channel order matters for this model.
            results = model.predict(low_res_img)
            boxes = [
                (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
                 int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
                for result in results for box in result.boxes
                if box.conf[0] > min_confidence
                and int(box.cls[0]) in {table_cls, figure_cls}
            ]
            if not boxes:
                continue

            high_res = page.get_pixmap(dpi=crop_dpi)
            high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(
                high_res.height, high_res.width, 3
            )

            # Scale from the actual pixmap sizes: pixel dimensions are rounded
            # per render, so the ratio is not always exactly crop_dpi/detect_dpi.
            scale_x = high_res.width / low_res.width
            scale_y = high_res.height / low_res.height

            for (x1, y1, x2, y2, cls) in boxes:
                top, bottom = max(0, int(y1 * scale_y)), max(0, int(y2 * scale_y))
                left, right = max(0, int(x1 * scale_x)), max(0, int(x2 * scale_x))
                cropped = high_res_img[top:bottom, left:right]
                if cropped.size == 0:
                    # A collapsed box has no pixels to show.
                    continue
                # Copy so the crop does not keep the whole page render alive.
                cropped = cropped.copy()
                if cls == figure_cls:
                    all_figures.append(cropped)
                else:
                    all_tables.append(cropped)

    return all_figures, all_tables

def image_to_base64(img):
    """Encode an image array as a base64 JPEG string.

    The array is converted to an RGB image, shrunk in place with
    ``thumbnail((800, 800))``, saved as JPEG at quality 85 and base64-encoded.

    Args:
        img: Image array accepted by ``PIL.Image.fromarray``.

    Returns:
        The base64 text of the JPEG bytes.
    """
    picture = Image.fromarray(img).convert("RGB")
    picture.thumbnail((800, 800))  # Keep the embedded payload small
    with io.BytesIO() as jpeg_buffer:
        picture.save(jpeg_buffer, format="JPEG", quality=85)
        jpeg_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(jpeg_bytes).decode()

# ----------------------------
# Streamlit UI Setup
# ----------------------------
# Page-level configuration for the app.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="PDF Assistant",
    page_icon="📄",
)

# Seed per-session state only when a key is missing; existing values are kept.
for _state_key, _initial_value in (("chat_history", []), ("current_file", None)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

st.title("📄 Smart PDF Analyzer")

# Introductory panel listing what the app offers.
_INTRO_HTML = """
<div style="border-left: 4px solid #4CAF50; padding-left: 1rem; margin: 1rem 0;">
    <p style="color: #666; font-size: 0.95rem;">✨ Upload a PDF to:
    <ul style="color: #666; font-size: 0.95rem;">
        <li>Generate structured summaries</li>
        <li>Extract visual content</li>
        <li>Ask contextual questions</li>
    </ul>
    </p>
</div>
"""
st.markdown(_INTRO_HTML, unsafe_allow_html=True)

def _reset_chat_history():
    """Empty the stored conversation; registered as the uploader's on_change callback."""
    st.session_state.chat_history = []


uploaded_file = st.file_uploader(
    "Choose PDF file",
    type="pdf",
    help="Max file size: 50MB",
    on_change=_reset_chat_history,
)

# Reject oversized uploads and halt this script run.
if uploaded_file:
    if uploaded_file.size > MAX_FILE_SIZE:
        st.error("File size exceeds 50MB limit")
        st.stop()

if uploaded_file:
    # The PDF helpers take a file path, so persist the upload to disk.
    # The `with` block closes the handle (it used to be left open).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(uploaded_file.getbuffer())
        file_path = tmp_pdf.name

    # This script reruns on every interaction and writes a fresh copy each
    # time; delete the copy left by the previous run so they do not pile up.
    previous_path = st.session_state.get("current_file")
    if previous_path and previous_path != file_path:
        try:
            os.remove(previous_path)
        except OSError:
            # Already gone or not removable; nothing depends on it.
            pass
    st.session_state.current_file = file_path

    # Let the user choose whether to include in-text citations in the summary
    include_citations = st.checkbox("Include in-text citations in summary", value=True)

    # Replay the stored conversation: user messages right, bot messages left.
    chat_container = st.container()
    with chat_container:
        for idx, chat in enumerate(st.session_state.chat_history):
            col1, col2 = st.columns([1, 4])
            if chat.get("user"):
                with col2:
                    message(chat["user"], is_user=True, key=f"user_{idx}")
            if chat.get("bot"):
                with col1:
                    message(chat["bot"], key=f"bot_{idx}", allow_html=True)
        scroll_to_bottom()

    with st.container():
        col1, col2, col3 = st.columns([3, 2, 2])
        with col1:
            user_input = st.chat_input("Ask about the document...")
        with col2:
            if st.button("📝 Generate Summary", use_container_width=True):
                with st.spinner("Analyzing document structure..."):
                    show_progress("Generating summary")
                    if include_citations:
                        summary = summarize_pdf_with_citations(file_path)
                    else:
                        summary = summarize_pdf(file_path)
                    st.session_state.chat_history.append({
                        "user": "Summary request",
                        "bot": f"## Document Summary\n{summary}"
                    })
                    st.rerun()
        with col3:
            if st.button("🖼️ Extract Visuals", use_container_width=True):
                with st.spinner("Identifying figures and tables..."):
                    show_progress("Extracting visuals")
                    figures, tables = process_pdf(file_path)
                    # One announcement per kind, then one chat entry per image.
                    for label, images in (("figures", figures), ("tables", tables)):
                        if not images:
                            continue
                        st.session_state.chat_history.append({
                            "bot": f"Found {len(images)} {label}:"
                        })
                        for image in images:
                            st.session_state.chat_history.append({
                                "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(image)}" style="max-width: 100%;">'
                            })
                    st.rerun()

    if user_input:
        # Record the question first so the answer attaches to the same entry.
        st.session_state.chat_history.append({"user": user_input})
        with st.spinner("Analyzing query..."):
            show_progress("Generating answer")
            answer = qa_pdf(file_path, user_input)
            st.session_state.chat_history[-1]["bot"] = f"## Answer\n{answer}"
            st.rerun()

# Custom styling for chat messages, buttons and the file uploader.
_CUSTOM_CSS = """
<style>
    .stChatMessage {
        padding: 1.25rem;
        margin: 1rem 0;
        border-radius: 12px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        transition: transform 0.2s ease;
    }
    .stChatMessage:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background: linear-gradient(45deg, #4CAF50, #45a049);
        color: white;
        border: none;
        border-radius: 8px;
        padding: 12px 24px;
        font-size: 16px;
        transition: all 0.3s ease;
    }
    .stButton>button:hover {
        box-shadow: 0 4px 12px rgba(76,175,80,0.3);
        transform: translateY(-1px);
    }
    [data-testid="stFileUploader"] {
        border: 2px dashed #4CAF50;
        border-radius: 12px;
        padding: 2rem;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)