# DocSum - app.py
import gradio as gr
from openai import OpenAI
import base64
from PIL import Image
import io
import fitz # PyMuPDF
import tempfile
import os
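
# Requirements (assumed): gradio, openai>=1.0 (OpenAI client with a custom base_url),
# pillow, and pymupdf. All model calls go to OpenRouter's OpenAI-compatible endpoint
# (https://openrouter.ai/api/v1) using the "opengvlab/internvl3-14b:free" vision model.
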
# --- HELPER FUNCTIONS ---
def convert_pdf_to_images(pdf_file):
    """Convert PDF bytes to a list of PIL Images (one per page)"""
    images = []
    try:
        # Save the uploaded bytes to a temporary file so PyMuPDF can open it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(pdf_file)
            tmp_file_path = tmp_file.name
        # Open the PDF file
        pdf_document = fitz.open(tmp_file_path)
        # Render each page to an RGB image
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
        # Clean up
        pdf_document.close()
        os.unlink(tmp_file_path)
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}")
    return images
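
# Rendering note: get_pixmap() above renders at PyMuPDF's default 72 dpi; passing a
# zoom matrix (e.g. page.get_pixmap(matrix=fitz.Matrix(2, 2))) gives the model
# higher-resolution page images if needed.
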
def image_to_base64(image):
    """Convert PIL Image to base64 string"""
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
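
# Each page image is sent to the model inline as a "data:image/png;base64,..." URL
# inside an "image_url" content part of the chat request (see analyze_document below).
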
def generate_summary(extracted_texts, api_key):
    """Generate a comprehensive summary of all extracted texts"""
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key
        )
        summary_prompt = f"""
You are an expert document analyst. Below are the extracted contents from multiple pages of a document.

Please provide a comprehensive, detailed summary that:
1. Organizes all key information logically
2. Identifies relationships between data points
3. Highlights important figures, dates, names
4. Presents the information in a clear, structured format

Extracted contents from pages:
{extracted_texts}

Comprehensive Summary:
"""
        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt}
            ],
            max_tokens=2048
        )
        return response.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}")
def analyze_document(api_key, user_prompt, uploaded_file):
    """Main processing function"""
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")
    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    # gr.File may hand the function a tempfile-like object with a .name path or a
    # plain filepath string, depending on the Gradio version; handle both.
    file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name

    images_to_analyze = []
    file_ext = os.path.splitext(file_path)[1].lower()

    # Handle PDF or image
    if file_ext == '.pdf':
        with open(file_path, "rb") as f:
            pdf_data = f.read()
        pdf_images = convert_pdf_to_images(pdf_data)
        images_to_analyze = pdf_images  # For simplicity, using all pages
    else:
        image = Image.open(file_path)
        images_to_analyze = [image]

    # Process each image (one client is enough for all requests)
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key
    )
    all_results = []
    extracted_texts = []
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model="opengvlab/internvl3-14b:free",
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }}
                    ]}
                ],
                max_tokens=1024
            )
            result = response.choices[0].message.content
            extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
            all_results.append(f"📄 Page {idx} Result:\n{result}\n---\n")
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}")

    # Generate summary if multiple pages
    final_output = "\n".join(all_results)
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        final_output += f"\n📝 Comprehensive Summary:\n{summary}"
    return final_output
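
# Note: analysis makes one vision-model call per page, plus one extra text call for
# the combined summary when a document has more than one page.
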
# --- GRADIO INTERFACE ---
with gr.Blocks(title="DocSum - Document Summarizer", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧾 DocSum")
gr.Markdown("Document Summarizer Powered by VLM β€’ Developed by [Koshur AI](https://koshurai.com)")
with gr.Row():
api_key = gr.Textbox(
label="πŸ”‘ OpenRouter API Key",
type="password",
placeholder="Enter your OpenRouter API key"
)
user_prompt = gr.Textbox(
label="πŸ“ Enter Your Prompt",
value="Extract all content structurally",
placeholder="What would you like to extract?"
)
uploaded_file = gr.File(
label="Upload Document (PDF/Image)",
file_types=[".pdf", ".jpg", ".jpeg", ".png"]
)
submit_btn = gr.Button("πŸ” Analyze Document", variant="primary")
# Replace Textbox with Markdown output
output = gr.Markdown(
label="Analysis Results",
elem_classes=["markdown-output"]
)
submit_btn.click(
fn=analyze_document,
inputs=[api_key, user_prompt, uploaded_file],
outputs=output
)
# Add custom CSS for the markdown output
css = """
.markdown-output {
    padding: 20px;
    border-radius: 8px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    max-height: 600px;
    overflow-y: auto;
}
.markdown-output h2 {
    color: #2563eb;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
.markdown-output h3 {
    color: #3b82f6;
    margin-top: 1em;
}
"""
demo.css = css
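
# Note: the custom CSS above is attached by assigning demo.css after the Blocks are
# built; passing css=css to gr.Blocks(...) is the more common pattern and is assumed
# to be equivalent here.
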
if __name__ == "__main__":
    demo.launch()