|
import gradio as gr |
|
from openai import OpenAI |
|
import base64 |
|
from PIL import Image |
|
import io |
|
import fitz |
|
import tempfile |
|
import os |
|
|
|
|
|
def convert_pdf_to_images(pdf_file):
    """Render every page of a PDF into a PIL image.

    Args:
        pdf_file: Raw bytes of the PDF document.

    Returns:
        A list with one RGB PIL image per page, in page order.

    Raises:
        gr.Error: If the PDF cannot be written to disk, opened, or rendered.
    """
    images = []
    tmp_file_path = None
    try:
        # delete=False so the file can be reopened by name after this handle
        # is closed; it is removed explicitly in the ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(pdf_file)

        pdf_document = fitz.open(tmp_file_path)
        try:
            for page in pdf_document:
                # alpha=False keeps the sample buffer at 3 bytes per pixel,
                # which is what the "RGB" mode below expects.
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
        finally:
            # Close even when rendering fails so the document handle is released.
            pdf_document.close()
    except Exception as e:
        raise gr.Error(f"Error converting PDF: {e}") from e
    finally:
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask the
                # real result or the real error.
                pass
    return images
|
|
|
def image_to_base64(image):
    """Encode an image as the base64 text of its PNG serialization.

    Args:
        image: Object exposing a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The PNG bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    png_stream = io.BytesIO()
    try:
        image.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    finally:
        png_stream.close()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
|
|
|
def generate_summary(extracted_texts, api_key):
    """Ask the model for one consolidated summary of all page extractions.

    Args:
        extracted_texts: The per-page extraction results, either already
            joined into a single string or as an iterable of strings
            (joined with newlines here).
        api_key: OpenRouter API key used to authenticate the request.

    Returns:
        The summary text returned by the model.

    Raises:
        gr.Error: If the request fails or the model returns no content.
    """
    if not isinstance(extracted_texts, str):
        extracted_texts = "\n".join(str(text) for text in extracted_texts)

    summary_prompt = (
        "\n"
        "You are an expert document analyst. Below are the extracted contents from multiple pages of a document.\n"
        "Please provide a comprehensive, detailed summary that:\n"
        "1. Organizes all key information logically\n"
        "2. Identifies relationships between data points\n"
        "3. Highlights important figures, dates, names\n"
        "4. Presents the information in a clear, structured format\n"
        "\n"
        "Extracted contents from pages:\n"
        f"{extracted_texts}\n"
        "\n"
        "Comprehensive Summary:\n"
    )

    # Only the network call lives in the ``try`` so the "no content" error
    # raised below is not caught and re-wrapped.
    try:
        client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=2048,
        )
    except Exception as e:
        raise gr.Error(f"Error generating summary: {e}") from e

    # A response can come back without any choices or with empty content;
    # report that explicitly instead of returning None to the caller.
    choices = getattr(response, "choices", None)
    summary = choices[0].message.content if choices else None
    if not summary:
        raise gr.Error("Error generating summary: the model returned no content")
    return summary
|
|
|
def _load_document_images(file_path):
    """Return one PIL image per PDF page, or a single-item list for an image file."""
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        with open(file_path, "rb") as f:
            return convert_pdf_to_images(f.read())

    try:
        # copy() reads the pixel data into an independent image, so the
        # underlying file handle can be closed straight away.
        with Image.open(file_path) as opened:
            return [opened.copy()]
    except Exception as e:
        raise gr.Error(f"Error opening image: {e}") from e


def _describe_image(client, image, user_prompt):
    """Send one image plus the user's prompt to the model and return its reply text."""
    image_base64 = image_to_base64(image)

    response = client.chat.completions.create(
        model="opengvlab/internvl3-14b:free",
        messages=[
            {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
            {"role": "user", "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {
                    "url": f"data:image/png;base64,{image_base64}"
                }},
            ]},
        ],
        max_tokens=1024,
    )

    choices = getattr(response, "choices", None)
    content = choices[0].message.content if choices else None
    if not content:
        raise ValueError("the model returned no content")
    return content


def analyze_document(api_key, user_prompt, uploaded_file):
    """Run the model over every page of the uploaded document.

    Each page (or the single image) is sent to the model together with
    ``user_prompt``. When more than one page was processed, a combined
    summary produced by ``generate_summary`` is appended.

    Args:
        api_key: OpenRouter API key.
        user_prompt: Instruction sent to the model alongside each page image.
        uploaded_file: The uploaded file, either a path string or an object
            whose ``name`` attribute is the path on disk.

    Returns:
        The per-page results, followed by the combined summary when the
        document has more than one page.

    Raises:
        gr.Error: On missing inputs, an unreadable document, or a failed
            model request.
    """
    if not api_key:
        raise gr.Error("Please enter your OpenRouter API key")

    if uploaded_file is None:
        raise gr.Error("Please upload a document")

    # Accept both a plain path string and a file-like object exposing ``.name``.
    file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name

    images_to_analyze = _load_document_images(file_path)
    if not images_to_analyze:
        raise gr.Error("The uploaded document contains no pages to analyze")

    # One client serves every page; nothing about it changes per iteration.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=api_key,
    )

    all_results = []
    extracted_texts = []

    # NOTE(review): the leading "π" in the strings below looks like a
    # mis-decoded emoji; the original glyph cannot be recovered from here.
    for idx, image in enumerate(images_to_analyze, 1):
        try:
            result = _describe_image(client, image, user_prompt)
        except Exception as e:
            raise gr.Error(f"Error analyzing page {idx}: {e}") from e

        extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
        all_results.append(f"π Page {idx} Result:\n{result}\n---\n")

    final_output = "\n".join(all_results)

    # A summary only adds value when there is more than one page to combine.
    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts), api_key)
        final_output += f"\nπ Comprehensive Summary:\n{summary}"

    return final_output
|
|
|
|
|
# Styling for the results panel (the Markdown component tagged with the
# "markdown-output" class). Defined before the Blocks app so it can be passed
# to the constructor, next to the theme, instead of being assigned to the
# app object after it has been built.
css = """
.markdown-output {
    padding: 20px;
    border-radius: 8px;
    background: #f9fafb;
    border: 1px solid #e5e7eb;
    max-height: 600px;
    overflow-y: auto;
}
.markdown-output h2 {
    color: #2563eb;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
.markdown-output h3 {
    color: #3b82f6;
    margin-top: 1em;
}
"""

# NOTE(review): the lone "π" prefixes in the labels below look like
# mis-decoded emoji whose original glyphs cannot be recovered; they are kept
# unchanged. The title and tagline glyphs decode unambiguously and are repaired.
with gr.Blocks(
    title="DocSum - Document Summarizer",
    theme=gr.themes.Soft(),
    css=css,
) as demo:
    gr.Markdown("# 🧾 DocSum")
    gr.Markdown("Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")

    # NOTE(review): row membership is reconstructed (the source indentation
    # was lost); confirm that only the two textboxes belong in this row.
    with gr.Row():
        api_key = gr.Textbox(
            label="π OpenRouter API Key",
            type="password",
            placeholder="Enter your OpenRouter API key",
        )
        user_prompt = gr.Textbox(
            label="π Enter Your Prompt",
            value="Extract all content structurally",
            placeholder="What would you like to extract?",
        )

    uploaded_file = gr.File(
        label="Upload Document (PDF/Image)",
        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
    )

    submit_btn = gr.Button("π Analyze Document", variant="primary")

    output = gr.Markdown(
        label="Analysis Results",
        elem_classes=["markdown-output"],
    )

    # The click handler receives the three inputs in this order and renders
    # the returned text into the results panel.
    submit_btn.click(
        fn=analyze_document,
        inputs=[api_key, user_prompt, uploaded_file],
        outputs=output,
    )


if __name__ == "__main__":
    demo.launch()