"""DocSum: a Gradio app that extracts and summarizes document content with a VLM.

An uploaded image or PDF is turned into one PIL image per page, each page is
sent to a vision-language model through the OpenRouter (OpenAI-compatible)
API, and, when more than one page was analyzed, a combined summary is
generated and offered as a downloadable text file.

The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
"""

import base64
import io
import mimetypes
import os
import tempfile
from typing import List, Optional, Tuple, Union

import fitz  # PyMuPDF
import gradio as gr
from openai import OpenAI
from PIL import Image

# --- OPENAI CLIENT SETUP ---
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
MODEL_NAME = "opengvlab/internvl3-14b:free"
API_KEY_ENV_VAR = "OPENROUTER_API_KEY"

# Image modes Pillow can write as PNG without conversion.
_PNG_SAFE_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

_SUMMARY_PROMPT_TEMPLATE = """
You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
Please provide a comprehensive, detailed summary that:
1. Organizes all key information logically
2. Identifies relationships between data points
3. Highlights important figures, dates, names
4. Presents the information in a clear, structured format

Extracted contents from pages:
{extracted_texts}

Comprehensive Summary:
"""


def _load_api_key() -> str:
    """Return the OpenRouter API key from the environment.

    Raises:
        RuntimeError: if the environment variable is missing or blank.
    """
    # SECURITY: the key used to be hard-coded in this file. Any key that was
    # ever committed must be treated as compromised and revoked.
    key = os.environ.get(API_KEY_ENV_VAR, "").strip()
    if not key:
        raise RuntimeError(
            f"Set the {API_KEY_ENV_VAR} environment variable to your OpenRouter API key."
        )
    return key


client = OpenAI(base_url=OPENROUTER_BASE_URL, api_key=_load_api_key())


def _resolve_upload_path(file) -> Optional[str]:
    """Return the filesystem path behind an upload, or None for a pure stream.

    Gradio hands event handlers either a path string or a temp-file wrapper
    exposing ``.name``; both are reduced to a plain path here.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    name = getattr(file, "name", None)
    if isinstance(name, str) and os.path.isfile(name):
        return name
    return None


def _detect_mime_type(file, path: Optional[str]) -> str:
    """Return the upload's MIME type, or "" when it cannot be determined.

    An explicit string ``.type`` attribute on the upload wins; otherwise the
    type is guessed from the file extension.
    """
    declared = getattr(file, "type", None)
    if isinstance(declared, str) and declared:
        return declared
    if path is not None:
        guessed, _encoding = mimetypes.guess_type(path)
        return guessed or ""
    return ""


def _open_pdf(pdf_file):
    """Open an uploaded PDF with PyMuPDF from its path or from its bytes."""
    path = _resolve_upload_path(pdf_file)
    if path is not None:
        return fitz.open(path)
    # Stream-only upload: rewind first in case it was already read once.
    if hasattr(pdf_file, "seek"):
        pdf_file.seek(0)
    return fitz.open(stream=pdf_file.read(), filetype="pdf")


def convert_pdf_to_images(pdf_file) -> Union[List[Image.Image], str]:
    """Convert PDF to list of PIL Images.

    Args:
        pdf_file: a path, a temp-file wrapper with ``.name``, or a binary
            file-like object with ``read()``.

    Returns:
        One RGB image per page, or an ``"Error converting PDF: ..."`` string
        on failure (callers detect failure with ``isinstance(result, str)``).
    """
    try:
        document = _open_pdf(pdf_file)
        try:
            images = []
            for page in document:
                # alpha=False guarantees 3-channel samples matching "RGB".
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
            return images
        finally:
            # Close the document even if rendering a page fails.
            document.close()
    except Exception as e:
        return f"Error converting PDF: {e}"


def image_to_base64(image) -> str:
    """Convert PIL Image to a base64-encoded PNG string."""
    # Modes such as CMYK cannot be written as PNG; normalize them first.
    if image.mode not in _PNG_SAFE_MODES:
        image = image.convert("RGB")
    with io.BytesIO() as buffer:
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")


def generate_summary(extracted_texts) -> str:
    """Generate a comprehensive summary of all extracted texts.

    Returns:
        The model's summary, or an ``"Error generating summary: ..."`` string
        if the API call fails.
    """
    try:
        summary_prompt = _SUMMARY_PROMPT_TEMPLATE.format(
            extracted_texts=extracted_texts
        )
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."},
                {"role": "user", "content": summary_prompt},
            ],
            max_tokens=2048,
        )
        # The API may return no content; never hand None to callers that
        # concatenate the result.
        return response.choices[0].message.content or ""
    except Exception as e:
        return f"Error generating summary: {e}"


def _valid_pages(selected_pages, total: int) -> List[int]:
    """Return the 1-based page numbers from selected_pages within 1..total.

    Order is preserved. Entries that are not integers or are out of range are
    dropped, so a stale selection cannot index the wrong page or raise.
    """
    pages = []
    for raw in selected_pages:
        try:
            number = int(raw)
        except (TypeError, ValueError):
            continue
        if 1 <= number <= total:
            pages.append(number)
    return pages


def analyze_images(images, user_prompt, selected_pages=None) -> Tuple[str, Optional[str]]:
    """Analyze the selected page images and summarize them when there are several.

    Args:
        images: list of PIL images, or an error string from PDF conversion.
        user_prompt: instruction sent to the model alongside each image.
        selected_pages: 1-based page numbers to analyze; None means all.

    Returns:
        ``(markdown_result, summary)``. ``summary`` is None unless more than
        one page was analyzed successfully.
    """
    if not images:
        return "No images provided for analysis.", None
    if isinstance(images, str):  # error message
        return images, None

    if selected_pages is None:
        selected_pages = range(1, len(images) + 1)
    pages = _valid_pages(selected_pages, len(images))

    all_results = []
    extracted_texts = []
    for page_number in pages:
        try:
            image_base64 = image_to_base64(images[page_number - 1])
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
                    {"role": "user", "content": [
                        {"type": "text", "text": user_prompt},
                        {"type": "image_url", "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }},
                    ]},
                ],
                max_tokens=1024,
            )
            result = response.choices[0].message.content or ""
        except Exception as e:
            # One failing page must not abort the others; report it inline.
            all_results.append(f"An error occurred analyzing page {page_number}: {e}")
            continue
        extracted_texts.append(f"=== Page {page_number} ===\n{result}\n")
        all_results.append(f"### 📄 Page {page_number} Result:")
        all_results.append(result)
        all_results.append("---")

    full_result = "\n".join(all_results)

    if len(extracted_texts) > 1:
        summary = generate_summary("\n".join(extracted_texts))
        full_result += "\n\n## 📝 Comprehensive Document Summary\n"
        full_result += summary
        return full_result, summary
    if len(extracted_texts) == 1:
        return full_result, None
    # Nothing succeeded: keep any per-page errors visible to the user.
    message = "No valid results generated."
    if full_result:
        message = f"{message}\n\n{full_result}"
    return message, None


def process_input(file, user_prompt, page_numbers) -> Tuple[str, Optional[str]]:
    """Route an uploaded file to PDF or image analysis.

    Args:
        file: the upload (path string, temp-file wrapper, or None).
        user_prompt: instruction sent to the model for every page.
        page_numbers: selected 1-based PDF pages; empty or None means all.

    Returns:
        ``(markdown_result, summary)`` as produced by ``analyze_images``, or
        ``(error_message, None)``.
    """
    if file is None:
        return "Please upload a file.", None

    path = _resolve_upload_path(file)
    mime_type = _detect_mime_type(file, path)

    if mime_type == "application/pdf":
        images = convert_pdf_to_images(file)
        if isinstance(images, str):  # error message
            return images, None
        if not page_numbers:
            page_numbers = list(range(1, len(images) + 1))
        return analyze_images(images, user_prompt, page_numbers)

    if mime_type.startswith("image/"):
        try:
            with Image.open(path if path is not None else file) as opened:
                # copy() forces the pixel data to load before the file closes.
                image = opened.copy()
        except OSError as e:
            return f"Error opening image: {e}", None
        return analyze_images([image], user_prompt)

    return "Unsupported file type. Please upload a JPG/PNG/PDF.", None


def update_page_selector(file):
    """Show a page selector for PDFs and hide it for anything else."""
    # value=[] clears selections left over from a previously uploaded file.
    hidden = gr.update(choices=[], value=[], visible=False)
    if not file:
        return hidden
    if _detect_mime_type(file, _resolve_upload_path(file)) != "application/pdf":
        return hidden
    try:
        doc = _open_pdf(file)
        try:
            num_pages = len(doc)
        finally:
            doc.close()
    except Exception:
        # An unreadable PDF is reported when the user submits; here the
        # selector just stays hidden.
        return hidden
    return gr.update(choices=list(range(1, num_pages + 1)), value=[], visible=True)


def handle_submit(file, user_prompt, pages):
    """Run the analysis and expose the summary as a downloadable text file."""
    result, summary = process_input(file, user_prompt, pages)
    summary_file = None
    if summary:
        # delete=False: the file has to outlive this call so it can be downloaded.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as tmpfile:
            tmpfile.write(summary)
            summary_file = tmpfile.name
    # The download component starts hidden; reveal it only when there is a file.
    return result, gr.update(value=summary_file, visible=summary_file is not None)


# --- GRADIO INTERFACE ---
with gr.Blocks(title="DocSum - Document Summarizer") as demo:
    gr.Markdown("""

🧾 DocSum

Document Summarizer Powered by VLM • Developed by Koshur AI

""")

    with gr.Row():
        with gr.Column():
            file_upload = gr.File(
                label="Upload a document (JPG/PNG/PDF)",
                file_types=[".jpg", ".jpeg", ".png", ".pdf"],
            )
            prompt = gr.Textbox(
                label="📝 Enter Your Prompt",
                value="Extract all content structurally",
            )
            page_selector = gr.CheckboxGroup(
                label="Select Pages (for PDFs only)", choices=[], visible=False
            )
            file_upload.change(
                fn=update_page_selector, inputs=file_upload, outputs=page_selector
            )
            submit_btn = gr.Button("🔍 Analyze Document")

        with gr.Column():
            output_box = gr.Markdown(label="Analysis Output")
            summary_download = gr.File(label="Download Summary", visible=False)

    submit_btn.click(
        fn=handle_submit,
        inputs=[file_upload, prompt, page_selector],
        outputs=[output_box, summary_download],
    )

    gr.Markdown("")

# Launch Gradio App
if __name__ == "__main__":
    demo.launch()