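"""Gradio app for analyzing PDF slide decks with InternVL2.5.

Loads the OpenGVLab/InternVL2_5-8B vision-language model, converts an
uploaded PDF into one image per page with pdf2image, and runs a
user-selected prompt against each slide.
"""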
import os
import torch
import gradio as gr
import pdf2image
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as transforms
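# Note: pdf2image requires the poppler utilities to be installed on the
# system (e.g. `apt-get install poppler-utils` on Debian/Ubuntu).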

# Configuration
MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
IMAGE_SIZE = 448  # InternVL2.5's native input tile resolution

# Model loading function
def load_model():
    print(f"\n=== Loading {MODEL_NAME} ===")
    print(f"CUDA available: {torch.cuda.is_available()}")
    
    # Set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load model and tokenizer with minimal options to avoid compatibility
    # issues. The checkpoint is published in bfloat16, so load it in that
    # dtype on GPU to keep memory manageable; fall back to float32 on CPU.
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            device_map="auto" if torch.cuda.is_available() else None
        )
        model.eval()
        
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            trust_remote_code=True
        )
        
        print("✓ Model and tokenizer loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Extract slides from an uploaded PDF file
def extract_slides_from_pdf(file_obj):
    try:
        # Gradio may hand us either a file path (str) or a file-like object
        # with a .name attribute, depending on the version; handle both.
        # The upload is already on disk, so we can read it in place rather
        # than copying the bytes to a temporary file.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_extension = os.path.splitext(file_path)[1].lower()
        
        # Check that it's a PDF
        if file_extension != '.pdf':
            return []
        
        # Convert each PDF page to a PIL image using pdf2image
        slides = []
        try:
            images = pdf2image.convert_from_path(file_path, dpi=300)
            slides = [(f"Slide {i+1}", img) for i, img in enumerate(images)]
        except Exception as e:
            print(f"Error converting PDF: {e}")
        
        return slides
    
    except Exception as e:
        import traceback
        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return []

# Simple preprocessing for a single image
def preprocess_image(image):
    # Ensure three channels, then resize to the model's expected input size
    img = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE))
    
    # Convert PIL image to tensor and normalize with ImageNet statistics
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Apply transformation and add batch dimension
    img_tensor = transform(img).unsqueeze(0)
    
    # Move tensor to GPU and match the model's bfloat16 dtype if available
    if torch.cuda.is_available():
        img_tensor = img_tensor.to(torch.bfloat16).cuda()
        
    return img_tensor

# Image analysis function - using a simple single-image approach
def analyze_image(model, tokenizer, image, prompt):
    try:
        # Check that the image is valid
        if image is None:
            return "Please upload an image first."
        
        # Process the image with simple preprocessing
        processed_image = preprocess_image(image)
        
        # Simple prompt format: the <image> placeholder marks where the
        # visual tokens are inserted
        question = f"<image>\n{prompt}"
        
        # Use the model's chat method (InternVL's remote code expects a
        # generation_config dict)
        response, _ = model.chat(
            tokenizer=tokenizer,
            pixel_values=processed_image,
            question=question,
            generation_config=dict(max_new_tokens=512, do_sample=False),
            history=None,
            return_history=True
        )
        
        return response
    except Exception as e:
        import traceback
        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
        return error_msg

# Analyze multiple slides from a PDF
def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
    try:
        if file_obj is None:
            return "Please upload a PDF file."
        
        # Extract slides from PDF
        slides = extract_slides_from_pdf(file_obj)
        
        if not slides:
            return "No slides were extracted from the file. Please check that it's a valid PDF."
        
        # Limit to the requested number of slides (the slider value may
        # arrive as a float, so coerce it to int before slicing)
        slides = slides[:int(num_slides)]
        
        # Analyze each slide
        analyses = []
        for slide_title, slide_image in slides:
            analysis = analyze_image(model, tokenizer, slide_image, prompt)
            analyses.append((slide_title, analysis))
        
        # Format the results
        result = ""
        for slide_title, analysis in analyses:
            result += f"## {slide_title}\n\n{analysis}\n\n---\n\n"
        
        return result
    
    except Exception as e:
        import traceback
        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
        return error_msg

# Main function
def main():
    # Load the model
    model, tokenizer = load_model()
    
    if model is None:
        # Create an error interface if model loading failed
        demo = gr.Interface(
            fn=lambda x: "Model loading failed. Please check the logs for details.",
            inputs=gr.Textbox(),
            outputs=gr.Textbox(),
            title="InternVL2.5 Slide Analyzer - Error",
            description="The model failed to load. Please check the logs for more information."
        )
        return demo
    
    # Create a simple interface
    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
        gr.Markdown("Upload a PDF file and analyze multiple slides")
        
        # PDF analysis controls and prompt presets
        slide_prompts = [
            "Analyze this slide and describe its contents.",
            "What is the main message of this slide?",
            "Extract all the text visible in this slide.",
            "What are the key points presented in this slide?",
            "Describe the visual elements and layout of this slide."
        ]
        
        with gr.Row():
            file_input = gr.File(label="Upload PDF")
            slide_prompt = gr.Dropdown(
                choices=slide_prompts, 
                value=slide_prompts[0], 
                label="Select a prompt",
                allow_custom_value=True
            )
        
        num_slides = gr.Slider(
            minimum=1, 
            maximum=5, 
            value=2, 
            step=1, 
            label="Number of Slides to Analyze"
        )
        
        slides_analyze_btn = gr.Button("Analyze Slides")
        slides_output = gr.Markdown(label="Analysis Results")
        
        # Handle the slides analysis action
        slides_analyze_btn.click(
            fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
            inputs=[file_input, slide_prompt, num_slides],
            outputs=slides_output
        )
        
        # Add example if available
        if os.path.exists("example_slides/test_slides.pdf"):
            gr.Examples(
                examples=[
                    ["example_slides/test_slides.pdf", "Extract all the text visible in this slide.", 2]
                ],
                inputs=[file_input, slide_prompt, num_slides]
            )
    
    return demo

# Run the application
if __name__ == "__main__":
    try:
        # Create and launch the interface (listens on all interfaces on
        # Gradio's default port, 7860)
        demo = main()
        demo.launch(server_name="0.0.0.0")
    except Exception as e:
        print(f"Error starting the application: {e}")
        import traceback
        traceback.print_exc()