import os
import time
import io
import base64
import re
import functools
import numpy as np
import fitz # PyMuPDF
import tempfile
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO
import streamlit as st
from streamlit_chat import message
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import SpacyTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from streamlit.runtime.scriptrunner import get_script_run_ctx
from streamlit import runtime
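# Note: fitz, io, base64, PIL, and KMeans support the figure/table extraction
# pipeline (process_pdf / image_to_base64), which is omitted from this listing.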
# Initialize models and environment
# Download the spaCy sentence model only if it is not already installed
try:
    import spacy
    spacy.load("en_core_web_sm")
except OSError:
    os.system("python -m spacy download en_core_web_sm")

model = YOLO("best.pt")
openai_api_key = os.environ.get("openai_api_key")
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB
# Utility functions
@st.cache_data(show_spinner=False, ttl=3600)
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()
def remove_references(text):
    # Truncate the document at the first reference-section heading
    # (re.IGNORECASE makes the case variants in the original list redundant)
    reference_patterns = [
        r'\bReferences?\b', r'\bBibliography\b',
        r'\bCitations\b', r'\bWorks Cited\b'
    ]
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if any(re.search(pattern, line, re.IGNORECASE) for pattern in reference_patterns):
            return '\n'.join(lines[:i])
    return text
def handle_errors(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            st.session_state.setdefault("chat_history", [])
            st.session_state.chat_history.append({
                "bot": f"❌ An error occurred: {str(e)}"
            })
            st.rerun()
    return wrapper
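# Usage sketch: any step decorated with @handle_errors reports its exception in
# the chat history instead of crashing the app, e.g.
#   @handle_errors
#   def risky_step(): ...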
def scroll_to_bottom():
    ctx = get_script_run_ctx()
    if ctx and runtime.exists():
        # The original scroll script was omitted from this listing; this is a
        # minimal stand-in that scrolls the host page to the bottom.
        js = """
        <script>
            const main = window.parent.document.querySelector('section.main');
            if (main) { main.scrollTo(0, main.scrollHeight); }
        </script>
        """
        st.components.v1.html(js, height=0)
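# Caching note: st.cache_data builds cache keys by hashing a function's
# arguments; the leading underscore in `_pdf_file_path` below tells Streamlit
# to skip hashing that parameter.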
# Core processing functions
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf(_pdf_file_path, num_clusters=10):  # num_clusters is currently unused
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    # Load PDF with page numbers
    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()

    # Create chunks with page metadata
    text_splitter = SpacyTextSplitter(chunk_size=500)
    chunks_with_metadata = []
    for doc in docs:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            chunks_with_metadata.append({
                "text": clean_text(chunk),
                "page": doc.metadata["page"] + 1  # convert to 1-based numbering
            })
    # Prepare prompt with citation instructions
    # ({{summary_content}} is escaped so the template's only input variable is `topic`)
    prompt = ChatPromptTemplate.from_template(
        """Generate a comprehensive summary with inline citations using [Source X] format.
        Include these elements:
        1. Key findings and conclusions
        2. Main methodologies used
        3. Important data points
        4. Limitations mentioned
        Structure your response as:
        ## Comprehensive Summary
        {{summary_content}}
        Contexts: {topic}"""
    )

    # Generate summary
    chain = prompt | llm | StrOutputParser()
    raw_summary = chain.invoke({
        "topic": ' '.join(chunk["text"] for chunk in chunks_with_metadata)
    })
    return generate_interactive_citations(raw_summary, chunks_with_metadata)
def generate_interactive_citations(summary_text, source_chunks):
    # NOTE: the original HTML/JS markup was stripped from this listing; the
    # markup below is a minimal reconstruction that preserves the behaviour of
    # jumping from an inline citation to its source entry via anchor links.
    sources_html = """
    <div class="sources-panel">
        <h3>📖 Source References</h3>
    """
    source_mapping = {}
    for idx, chunk in enumerate(source_chunks):
        source_id = f"source-{idx+1}"
        source_mapping[idx + 1] = {
            "id": source_id,
            "page": chunk["page"],
            "text": chunk["text"]
        }
        sources_html += f"""
        <div class="source-item" id="{source_id}">
            <strong>Source {idx+1}</strong> (Page {chunk['page']})
            <p>{chunk['text']}</p>
        </div>
        """
    sources_html += """
    </div>
    """

    # Click interactions: the original script was omitted; plain anchors are
    # enough for jump-to-source behaviour, so no extra JS is injected here
    interaction_js = ""

    # Replace citations with interactive links
    cited_summary = re.sub(
        r'\[Source (\d+)\]',
        lambda m: f'<a href="#source-{m.group(1)}">[Source {m.group(1)}]</a>',
        summary_text
    )

    return f"""
    {cited_summary}
    {sources_html}
    {interaction_js}
    """
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    # Load PDF with page numbers
    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()

    # Create chunks with page metadata
    text_splitter = SpacyTextSplitter(chunk_size=500)
    chunks_with_metadata = []
    for doc in docs:
        chunks = text_splitter.split_text(doc.page_content)
        for chunk in chunks:
            chunks_with_metadata.append({
                "text": clean_text(chunk),
                "page": doc.metadata["page"] + 1
            })

    # Find relevant chunks
    embeddings = embeddings_model.embed_documents([chunk["text"] for chunk in chunks_with_metadata])
    query_embedding = embeddings_model.embed_query(query)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    top_indices = np.argsort(similarities)[-num_clusters:][::-1]
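    # np.argsort sorts ascending, so slicing the last num_clusters indices and
    # reversing them yields the chunks most similar to the query, best match first.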
    # Prepare prompt with citation instructions
    prompt = ChatPromptTemplate.from_template(
        """Answer this question with inline citations using [Source X] format:
        {question}
        Use these verified sources:
        {context}
        Structure your answer with:
        - Clear section headings
        - Bullet points for lists
        - Citations for all factual claims"""
    )

    chain = prompt | llm | StrOutputParser()
    # Number sources by their rank in top_indices so the [Source N] labels the
    # LLM emits line up with the entries generate_interactive_citations renders
    raw_answer = chain.invoke({
        "question": query,
        "context": '\n\n'.join(
            f"Source {rank+1} (Page {chunks_with_metadata[i]['page']}): {chunks_with_metadata[i]['text']}"
            for rank, i in enumerate(top_indices)
        )
    })
    return generate_interactive_citations(raw_answer, [chunks_with_metadata[i] for i in top_indices])
# NOTE: process_pdf, image_to_base64, and the related figure/table-extraction
# helpers and UI wiring from the previous implementation are omitted from this
# listing; the error handling and UI improvements below are unchanged.
# Streamlit UI Configuration
st.set_page_config(
    page_title="PDF Research Assistant",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS Styles (the original style block was omitted from this listing)
st.markdown("""
<style>
/* chat bubble and source-panel styles go here */
</style>
""", unsafe_allow_html=True)
# Session state initialization
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'current_file' not in st.session_state:
    st.session_state.current_file = None
# Main UI
st.title("📄 Academic PDF Analyzer")
st.markdown("""
🔍 Upload research papers to:
- Generate citations-backed summaries
- Trace claims to original sources
- Extract data tables and figures
- Q&A with verifiable references
""", unsafe_allow_html=True)
# File uploader
uploaded_file = st.file_uploader(
    "Upload research PDF",
    type="pdf",
    help="Maximum file size: 50MB",
    on_change=lambda: setattr(st.session_state, 'chat_history', [])
)

if uploaded_file and uploaded_file.size > MAX_FILE_SIZE:
    st.error("File size exceeds 50MB limit")
    st.stop()
# Document processing
if uploaded_file:
    # Persist the upload to a named temporary file so the loaders get a real path
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.getbuffer())
        file_path = tmp.name

    # Chat interface
    chat_container = st.container()
    with chat_container:
        for idx, chat in enumerate(st.session_state.chat_history):
            col1, col2 = st.columns([1, 4])
            if chat.get("user"):
                with col2:
                    message(chat["user"], is_user=True, key=f"user_{idx}")
            if chat.get("bot"):
                with col1:
                    message(chat["bot"], key=f"bot_{idx}", allow_html=True)
        scroll_to_bottom()
    # Interaction controls
    with st.container():
        col1, col2, col3 = st.columns([3, 2, 2])
        with col1:
            user_input = st.chat_input("Ask a research question...")
        with col2:
            if st.button("📄 Generate Summary", use_container_width=True):
                with st.spinner("Analyzing document structure..."):
                    summary = summarize_pdf(file_path)
                st.session_state.chat_history.append({
                    "bot": f"## Research Summary\n{summary}"
                })
                st.rerun()
        with col3:
            if st.button("🔄 Clear Session", use_container_width=True):
                st.session_state.chat_history = []
                st.rerun()
    # Handle user questions
    if user_input:
        st.session_state.chat_history.append({"user": user_input})
        with st.spinner("Verifying sources..."):
            answer = qa_pdf(file_path, user_input)
        st.session_state.chat_history[-1]["bot"] = f"## Research Answer\n{answer}"
        st.rerun()