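"""Gradio app for analyzing PDF slide decks with InternVL2.5.

Loads the OpenGVLab/InternVL2_5-8B vision-language model, converts an
uploaded PDF into one image per page with pdf2image, and runs a
user-selected prompt against each slide.
"""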
import os
import torch
import gradio as gr
import pdf2image
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as transforms
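# Note: pdf2image requires the poppler utilities to be installed on the
# system (e.g. `apt-get install poppler-utils` on Debian/Ubuntu).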

# Configuration
MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
IMAGE_SIZE = 448  # InternVL2.5's native input tile resolution

# Model loading function
def load_model():
    print(f"\n=== Loading {MODEL_NAME} ===")
    print(f"CUDA available: {torch.cuda.is_available()}")
    
    # Set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load model and tokenizer with minimal options to avoid compatibility
    # issues. The checkpoint is published in bfloat16, so load it in that
    # dtype on GPU to keep memory manageable; fall back to float32 on CPU.
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map="auto" if torch.cuda.is_available() else None
        )
        model.eval()
        
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            trust_remote_code=True
        )
        
        print("✓ Model and tokenizer loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Extract slides from an uploaded PDF file
def extract_slides_from_pdf(file_obj):
    try:
        # Gradio may hand us either a file path (str) or a file-like object
        # with a .name attribute, depending on the version; handle both.
        # The upload is already on disk, so we can read it in place rather
        # than copying the bytes to a temporary file.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_extension = os.path.splitext(file_path)[1].lower()
        
        # Check that it's a PDF
        if file_extension != '.pdf':
            return []
        
        # Convert each PDF page to a PIL image using pdf2image
        slides = []
        try:
            images = pdf2image.convert_from_path(file_path, dpi=300)
            slides = [(f"Slide {i+1}", img) for i, img in enumerate(images)]
        except Exception as e:
            print(f"Error converting PDF: {e}")
        
        return slides
    
    except Exception as e:
        import traceback
        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return []

# Simple preprocessing for a single image
def preprocess_image(image):
    # Ensure three channels, then resize to the model's expected input size
    img = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE))
    
    # Convert PIL image to tensor and normalize with ImageNet statistics
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Apply transformation and add batch dimension
    img_tensor = transform(img).unsqueeze(0)
    
    # Move tensor to GPU and match the model's bfloat16 dtype if available
    if torch.cuda.is_available():
        img_tensor = img_tensor.to(torch.bfloat16).cuda()
        
    return img_tensor

# Image analysis function - using a simple single-image approach
def analyze_image(model, tokenizer, image, prompt):
    try:
        # Check that the image is valid
        if image is None:
            return "Please upload an image first."
        
        # Process the image with simple preprocessing
        processed_image = preprocess_image(image)
        
        # Simple prompt format: the <image> placeholder marks where the
        # visual tokens are inserted
        question = f"<image>\n{prompt}"
        
        # Use the model's chat method (InternVL's remote code expects a
        # generation_config dict)
        response, _ = model.chat(
            tokenizer=tokenizer,
            pixel_values=processed_image,
            question=question,
            generation_config=dict(max_new_tokens=512, do_sample=False),
            history=None,
            return_history=True
        )
        
        return response
    except Exception as e:
        import traceback
        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
        return error_msg

# Analyze multiple slides from a PDF
def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
    try:
        if file_obj is None:
            return "Please upload a PDF file."
        
        # Extract slides from PDF
        slides = extract_slides_from_pdf(file_obj)
        
        if not slides:
            return "No slides were extracted from the file. Please check that it's a valid PDF."
        
        # Limit to the requested number of slides (the slider value may
        # arrive as a float, so coerce it to int before slicing)
        slides = slides[:int(num_slides)]
        
        # Analyze each slide
        analyses = []
        for slide_title, slide_image in slides:
            analysis = analyze_image(model, tokenizer, slide_image, prompt)
            analyses.append((slide_title, analysis))
        
        # Format the results
        result = ""
        for slide_title, analysis in analyses:
            result += f"## {slide_title}\n\n{analysis}\n\n---\n\n"
        
        return result
    
    except Exception as e:
        import traceback
        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
        return error_msg

# Main function
def main():
    # Load the model
    model, tokenizer = load_model()
    
    if model is None:
        # Create an error interface if model loading failed
        demo = gr.Interface(
            fn=lambda x: "Model loading failed. Please check the logs for details.",
            inputs=gr.Textbox(),
            outputs=gr.Textbox(),
            title="InternVL2.5 Slide Analyzer - Error",
            description="The model failed to load. Please check the logs for more information."
        )
        return demo
    
    # Create a simple interface
    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
        gr.Markdown("Upload a PDF file and analyze multiple slides")
        
        # PDF analysis controls and prompt presets
        slide_prompts = [
            "Analyze this slide and describe its contents.",
            "What is the main message of this slide?",
            "Extract all the text visible in this slide.",
            "What are the key points presented in this slide?",
            "Describe the visual elements and layout of this slide."
        ]
        
        with gr.Row():
            file_input = gr.File(label="Upload PDF")
            slide_prompt = gr.Dropdown(
                choices=slide_prompts, 
                value=slide_prompts[0], 
                label="Select a prompt",
                allow_custom_value=True
            )
        
        num_slides = gr.Slider(
            minimum=1, 
            maximum=5, 
            value=2, 
            step=1, 
            label="Number of Slides to Analyze"
        )
        
        slides_analyze_btn = gr.Button("Analyze Slides")
        slides_output = gr.Markdown(label="Analysis Results")
        
        # Handle the slides analysis action
        slides_analyze_btn.click(
            fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
            inputs=[file_input, slide_prompt, num_slides],
            outputs=slides_output
        )
        
        # Add example if available
        if os.path.exists("example_slides/test_slides.pdf"):
            gr.Examples(
                examples=[
                    ["example_slides/test_slides.pdf", "Extract all the text visible in this slide.", 2]
                ],
                inputs=[file_input, slide_prompt, num_slides]
            )
    
    return demo

# Run the application
if __name__ == "__main__":
    try:
        # Create and launch the interface (listens on all interfaces on
        # Gradio's default port, 7860)
        demo = main()
        demo.launch(server_name="0.0.0.0")
    except Exception as e:
        print(f"Error starting the application: {e}")
        import traceback
        traceback.print_exc()