# DocSum - app.py
import gradio as gr
from openai import OpenAI
import base64
from PIL import Image
import io
import fitz # PyMuPDF
import tempfile
import os
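
# Requirements (assumed): gradio, openai>=1.0 (OpenAI client with a custom base_url),
# pillow, and pymupdf. All model calls go to OpenRouter's OpenAI-compatible endpoint
# (https://openrouter.ai/api/v1) using the "opengvlab/internvl3-14b:free" vision model.
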
# --- HELPER FUNCTIONS ---
def convert_pdf_to_images(pdf_file):
    """Convert PDF bytes to a list of PIL Images (one per page)"""
    images = []
    try:
        # Save the uploaded bytes to a temporary file so PyMuPDF can open it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(pdf_file)
            tmp_file_path = tmp_file.name
        # Open the PDF file
        pdf_document = fitz.open(tmp_file_path)
        # Render each page to an RGB image
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)
        # Clean up
        pdf_document.close()
        os.unlink(tmp_file_path)
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}")
    return images
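
# Rendering note: get_pixmap() above renders at PyMuPDF's default 72 dpi; passing a
# zoom matrix (e.g. page.get_pixmap(matrix=fitz.Matrix(2, 2))) gives the model
# higher-resolution page images if needed.
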
def image_to_base64(image):
    """Convert PIL Image to base64 string"""
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")
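
# Each page image is sent to the model inline as a "data:image/png;base64,..." URL
# inside an "image_url" content part of the chat request (see analyze_document below).
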
def generate_summary(extracted_texts, api_key):
    """Generate a comprehensive summary of all extracted texts"""
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key
        )
        summary_prompt = f"""
You are an expert document analyst. Below are the extracted contents from multiple pages of a document.

Please provide a comprehensive, detailed summary that:
1. Organizes all key information logically
2. Identifies relationships between data points
3. Highlights important figures, dates, names
4. Presents the information in a clear, structured format

Extracted contents from pages:
{extracted_texts}

Comprehensive Summary:
"""
        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt}
            ],
            max_tokens=2048
        )
        return response.choices[0].message.content
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}")
def analyze_document(api_key, user_prompt, uploaded_file):
    """Main processing function"""
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")
    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    # gr.File may hand the function a tempfile-like object with a .name path or a
    # plain filepath string, depending on the Gradio version; handle both.
    file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name

    images_to_analyze = []
    file_ext = os.path.splitext(file_path)[1].lower()

    # Handle PDF or image
    if file_ext == '.pdf':
        with open(file_path, "rb") as f:
            pdf_data = f.read()
        pdf_images = convert_pdf_to_images(pdf_data)
        images_to_analyze = pdf_images  # For simplicity, using all pages
    else:
        image = Image.open(file_path)
        images_to_analyze = [image]

    # Process each image (one client is enough for all requests)
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key
    )
    all_results = []
    extracted_texts = []
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            image_base64 = image_to_base64(image)
            response = client.chat.completions.create(
                model="opengvlab/internvl3-14b:free",
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }}
                    ]}
                ],
                max_tokens=1024
            )
            result = response.choices[0].message.content
            extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
            all_results.append(f"📄 Page {idx} Result:\n{result}\n---\n")
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}")

    # Generate summary if multiple pages
    final_output = "\n".join(all_results)
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        final_output += f"\n📝 Comprehensive Summary:\n{summary}"
    return final_output
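
# Note: analysis makes one vision-model call per page, plus one extra text call for
# the combined summary when a document has more than one page.
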
# --- GRADIO INTERFACE ---
with gr.Blocks(title="DocSum - Document Summarizer", theme=gr.themes.Soft()) as demo:
gr.Markdown("# 🧾 DocSum")
gr.Markdown("Document Summarizer Powered by VLM β€’ Developed by [Koshur AI](https://koshurai.com)")
with gr.Row():
api_key = gr.Textbox(
label="πŸ”‘ OpenRouter API Key",
type="password",
placeholder="Enter your OpenRouter API key"
)
user_prompt = gr.Textbox(
label="πŸ“ Enter Your Prompt",
value="Extract all content structurally",
placeholder="What would you like to extract?"
)
uploaded_file = gr.File(
label="Upload Document (PDF/Image)",
file_types=[".pdf", ".jpg", ".jpeg", ".png"]
)
submit_btn = gr.Button("πŸ” Analyze Document", variant="primary")
# Replace Textbox with Markdown output
output = gr.Markdown(
label="Analysis Results",
elem_classes=["markdown-output"]
)
submit_btn.click(
fn=analyze_document,
inputs=[api_key, user_prompt, uploaded_file],
outputs=output
)
# Add custom CSS for the markdown output
css = """
.markdown-output {
    padding: 20px;
    border-radius: 8px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    max-height: 600px;
    overflow-y: auto;
}
.markdown-output h2 {
    color: #2563eb;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
.markdown-output h3 {
    color: #3b82f6;
    margin-top: 1em;
}
"""
demo.css = css
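
# Note: the custom CSS above is attached by assigning demo.css after the Blocks are
# built; passing css=css to gr.Blocks(...) is the more common pattern and is assumed
# to be equivalent here.
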
if __name__ == "__main__":
    demo.launch()