import os
import time
import io
import base64
import re
import tempfile
from functools import wraps

import numpy as np
import fitz
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from ultralytics import YOLO

import streamlit as st
from streamlit_chat import message
from streamlit.runtime.scriptrunner import get_script_run_ctx
from streamlit import runtime

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import SpacyTextSplitter


# Make sure the spaCy sentence model used by SpacyTextSplitter is available.
os.system("python -m spacy download en_core_web_sm")

model = YOLO("best.pt")  # layout-detection weights used to find figures and tables
openai_api_key = os.environ.get("openai_api_key")
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB


@st.cache_data(show_spinner=False, ttl=3600)
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()


def remove_references(text):
    reference_patterns = [
        r'\bReferences\b', r'\breferences\b', r'\bBibliography\b',
        r'\bCitations\b', r'\bWorks Cited\b', r'\bReference\b'
    ]
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if any(re.search(pattern, line, re.IGNORECASE) for pattern in reference_patterns):
            return '\n'.join(lines[:i])
    return text


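# Decorator for the cached helpers below: turns any exception into a chat
# message instead of crashing the app, then reruns so the message is rendered.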
def handle_errors(func):
    @wraps(func)  # preserve the wrapped function's identity for st.cache_data
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            st.session_state.chat_history.append({
                "bot": f"⚠️ An error occurred: {str(e)}"
            })
            st.rerun()
    return wrapper


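# Purely cosmetic progress animation (~2 s) shown before the cached helpers run.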
def show_progress(message):
    progress_bar = st.progress(0)
    status_text = st.empty()
    for i in range(100):
        time.sleep(0.02)
        progress_bar.progress(i + 1)
        status_text.text(f"{message}... {i+1}%")
    progress_bar.empty()
    status_text.empty()


def scroll_to_bottom():
    ctx = get_script_run_ctx()
    if ctx and runtime.exists():
        js = """
        <script>
            function scrollToBottom() {
                window.parent.document.querySelector('section.main').scrollTo(0, window.parent.document.querySelector('section.main').scrollHeight);
            }
            setTimeout(scrollToBottom, 100);
        </script>
        """
        st.components.v1.html(js, height=0)


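# Summarization: split the PDF into ~500-character chunks, embed them, cluster
# the embeddings with K-means, and take the chunk closest to each cluster
# centre as a representative excerpt. The LLM cites those excerpts inline as
# <span class="tooltip"> elements, styled by the CSS at the bottom of the file.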
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf_with_tooltips(_pdf_file_path, num_clusters=10):
    """
    Generates a summary with in-text citations that display the full excerpt as a tooltip on hover.
    Each citation is embedded as an HTML span element with the tooltip text.
    """
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)

    # Literal braces in the HTML example are doubled so ChatPromptTemplate does
    # not treat them as template variables; {contexts} is the only real input.
    prompt = ChatPromptTemplate.from_template(
        """Generate a comprehensive summary that includes the following:
        1. Key findings and conclusions
        2. Main methodologies used
        3. Important data points
        4. Limitations mentioned
        For any information directly derived from the context excerpts provided below, insert an in-text citation as an HTML tooltip.
        For each citation, use the following HTML format:
        <span class="tooltip" data-tooltip="{{full_text}}">[{{n}}]</span>
        Where:
        - {{n}} is the citation number.
        - {{full_text}} is the complete excerpt text for that citation.
        Do not provide a separate reference list. Instead, embed the full citation text directly in the tooltip.
        Context Excerpts:
        {contexts}"""
    )

    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)

    embeddings = np.array(embeddings_model.embed_documents(split_contents))
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)

    # For each cluster, keep the chunk whose embedding is closest to the centre.
    citation_indices = []
    for center in kmeans.cluster_centers_:
        distances = np.linalg.norm(embeddings - center, axis=1)
        idx = int(np.argmin(distances))
        citation_indices.append(idx)

    citation_contexts = []
    for i, idx in enumerate(citation_indices):
        excerpt = split_contents[idx].replace('"', "'")
        citation_contexts.append(f"[{i+1}]: {excerpt}")
    combined_contexts = "\n\n".join(citation_contexts)

    chain = prompt | llm | StrOutputParser()
    result = chain.invoke({"contexts": combined_contexts})
    return result


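# Question answering: embed the query, rank the same ~500-character chunks by
# cosine similarity, and answer from the top matches only.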
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def qa_pdf(_pdf_file_path, query, num_clusters=5):
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Answer this question: {question}
        Using only this context: {context}
        Format your answer with:
        - Clear section headings
        - Bullet points for lists
        - **Bold** key terms
        - Citations from the text"""
    )

    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)

    query_embedding = embeddings_model.embed_query(query)
    similarities = cosine_similarity([query_embedding],
                                     embeddings_model.embed_documents(split_contents))[0]
    top_indices = np.argsort(similarities)[-num_clusters:]

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({
        "question": query,
        "context": ' '.join([split_contents[i] for i in top_indices])
    })


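# Figure/table extraction: render each page at low DPI for fast YOLO detection,
# then re-render at 300 DPI and crop the detected boxes. Class ids 3 and 4 come
# from the custom best.pt model; per the code below, class 4 is treated as a
# figure and class 3 as a table.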
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def process_pdf(_pdf_file_path):
    doc = fitz.open(_pdf_file_path)
    all_figures, all_tables = [], []
    scale_factor = 300 / 50  # map low-DPI box coordinates onto the 300-DPI render

    for page in doc:
        low_res = page.get_pixmap(dpi=50)
        low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(low_res.height, low_res.width, 3)

        results = model.predict(low_res_img)
        boxes = [
            (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
             int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
            for result in results for box in result.boxes
            if box.conf[0] > 0.8 and int(box.cls[0]) in {3, 4}
        ]

        if boxes:
            high_res = page.get_pixmap(dpi=300)
            high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(high_res.height, high_res.width, 3)

            for (x1, y1, x2, y2, cls) in boxes:
                cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
                                       int(x1*scale_factor):int(x2*scale_factor)]
                if cls == 4:
                    all_figures.append(cropped)
                else:
                    all_tables.append(cropped)

    return all_figures, all_tables


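# Downscale a cropped region and return it as a base64-encoded JPEG so it can
# be embedded inline in a chat message via a data: URI.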
def image_to_base64(img):
    buffered = io.BytesIO()
    img = Image.fromarray(img).convert("RGB")
    img.thumbnail((800, 800))
    img.save(buffered, format="JPEG", quality=85)
    return base64.b64encode(buffered.getvalue()).decode()


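# --- Streamlit UI ---------------------------------------------------------
# st.set_page_config has to run before any other Streamlit call that renders
# output, so page setup sits at the top of the UI section.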
st.set_page_config(
    page_title="PDF Assistant",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'current_file' not in st.session_state:
    st.session_state.current_file = None

st.title("📄 Smart PDF Analyzer")
st.markdown("""
<div style="border-left: 4px solid #4CAF50; padding-left: 1rem; margin: 1rem 0;">
  <p style="color: #666; font-size: 0.95rem;">✨ Upload a PDF to:</p>
  <ul style="color: #666; font-size: 0.95rem;">
    <li>Generate structured summaries</li>
    <li>Extract visual content</li>
    <li>Ask contextual questions</li>
  </ul>
</div>
""", unsafe_allow_html=True)

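# The uploader clears the chat history whenever a different file is selected;
# uploads larger than MAX_FILE_SIZE are rejected before any processing.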
uploaded_file = st.file_uploader(
    "Choose PDF file",
    type="pdf",
    help="Max file size: 50MB",
    on_change=lambda: setattr(st.session_state, 'chat_history', [])
)

if uploaded_file and uploaded_file.size > MAX_FILE_SIZE:
    st.error("File size exceeds 50MB limit")
    st.stop()

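# Persist the upload to a temporary file so the path-based loaders
# (PyMuPDFLoader, fitz) can read it on this and later reruns.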
if uploaded_file:
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.getbuffer())
        file_path = tmp.name

    # Note: this flag is currently informational only; the summary path always
    # embeds tooltip citations.
    include_citations = st.checkbox("Include in-text citations in summary", value=True)

    chat_container = st.container()
    with chat_container:
        for idx, chat in enumerate(st.session_state.chat_history):
            col1, col2 = st.columns([1, 4])
            if chat.get("user"):
                with col2:
                    message(chat["user"], is_user=True, key=f"user_{idx}")
            if chat.get("bot"):
                with col1:
                    message(chat["bot"], key=f"bot_{idx}", allow_html=True)
        scroll_to_bottom()

    # Control row: free-form question input plus the two one-click actions.
    with st.container():
        col1, col2, col3 = st.columns([3, 2, 2])
        with col1:
            user_input = st.chat_input("Ask about the document...")
        with col2:
            if st.button("📝 Generate Summary", use_container_width=True):
                with st.spinner("Analyzing document structure..."):
                    show_progress("Generating summary")
                    summary = summarize_pdf_with_tooltips(file_path)
                    st.session_state.chat_history.append({
                        "user": "Summary request",
                        "bot": f"## Document Summary\n{summary}"
                    })
                    st.rerun()

        with col3:
            if st.button("🖼️ Extract Visuals", use_container_width=True):
                with st.spinner("Identifying figures and tables..."):
                    show_progress("Extracting visuals")
                    figures, tables = process_pdf(file_path)
                    if figures:
                        st.session_state.chat_history.append({
                            "bot": f"Found {len(figures)} figures:"
                        })
                        for fig in figures:
                            st.session_state.chat_history.append({
                                "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(fig)}" style="max-width: 100%;">'
                            })
                    if tables:
                        st.session_state.chat_history.append({
                            "bot": f"Found {len(tables)} tables:"
                        })
                        for tab in tables:
                            st.session_state.chat_history.append({
                                "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(tab)}" style="max-width: 100%;">'
                            })
                    st.rerun()

    if user_input:
        st.session_state.chat_history.append({"user": user_input})
        with st.spinner("Analyzing query..."):
            show_progress("Generating answer")
            answer = qa_pdf(file_path, user_input)
            st.session_state.chat_history[-1]["bot"] = f"## Answer\n{answer}"
            st.rerun()

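# Chat-bubble and button styling, plus the .tooltip rule that renders the
# data-tooltip excerpts produced by summarize_pdf_with_tooltips as hover popups.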
st.markdown("""
<style>
    .stChatMessage {
        padding: 1.25rem;
        margin: 1rem 0;
        border-radius: 12px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        transition: transform 0.2s ease;
    }
    .stChatMessage:hover {
        transform: translateY(-2px);
    }
    .stButton>button {
        background: linear-gradient(45deg, #4CAF50, #45a049);
        color: white;
        border: none;
        border-radius: 8px;
        padding: 12px 24px;
        font-size: 16px;
        transition: all 0.3s ease;
    }
    .stButton>button:hover {
        box-shadow: 0 4px 12px rgba(76,175,80,0.3);
        transform: translateY(-1px);
    }
    [data-testid="stFileUploader"] {
        border: 2px dashed #4CAF50;
        border-radius: 12px;
        padding: 2rem;
    }
    .tooltip {
        position: relative;
        cursor: pointer;
        border-bottom: 1px dotted #555;
    }
    /* Tooltip text shown on hover */
    .tooltip:hover::after {
        content: attr(data-tooltip);
        position: absolute;
        left: 0;
        top: 1.5em;
        background: #333;
        color: #fff;
        padding: 5px 10px;
        border-radius: 5px;
        white-space: pre-wrap;
        z-index: 100;
        width: 300px;  /* Adjust width as needed */
    }
</style>
""", unsafe_allow_html=True)