"""Gradio demo that analyzes an uploaded image with the InternVL2-40B-AWQ model.

The model is served through lmdeploy. When lmdeploy cannot be imported the UI
still starts, but every analysis request returns a placeholder message.
"""

import os
import time
import warnings

import gradio as gr
import numpy as np
import torch
from PIL import Image

# Set environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Suppress specific warnings that might be caused by package version mismatches
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
warnings.filterwarnings("ignore", message=".*Torch is not compiled with CUDA enabled.*")
warnings.filterwarnings("ignore", category=UserWarning)

# Global variables
internvl2_pipeline = None  # Created on first use by load_internvl2_model()
MODEL_LOADED = False
USE_GPU = torch.cuda.is_available()

# Check if lmdeploy is available and try to import
try:
    from lmdeploy import pipeline, TurbomindEngineConfig
    LMDEPLOY_AVAILABLE = True
    print("Successfully imported lmdeploy")
except ImportError:
    LMDEPLOY_AVAILABLE = False
    print("lmdeploy import failed. Will use a placeholder for demos.")

# Model configuration
MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ"  # 4-bit quantized model

# Prompt sent to the model for each analysis type offered in the UI.
DEFAULT_PROMPT = "Describe this image in detail."
ANALYSIS_PROMPTS = {
    "general": DEFAULT_PROMPT,
    "text": "What text can you see in this image? Please transcribe it accurately.",
    "chart": (
        "Analyze any charts, graphs or diagrams in this image in detail, "
        "including trends, data points, and conclusions."
    ),
    "people": (
        "Describe the people in this image - their appearance, actions, "
        "and expressions."
    ),
    "technical": (
        "Provide a technical analysis of this image, including object "
        "identification, spatial relationships, and any technical elements present."
    ),
}

# Example image shown below the form, and the analysis types it is paired with.
EXAMPLE_IMAGE = "data_temp/page_2.png"
EXAMPLE_ANALYSIS_TYPES = ("general", "text", "chart")


def load_internvl2_model() -> bool:
    """Load the InternVL2 model using lmdeploy.

    The pipeline is created once and kept in the module-level
    ``internvl2_pipeline``; later calls return immediately.

    Returns:
        True if the pipeline is available, False if lmdeploy is missing or
        the pipeline could not be created (the reason is printed).
    """
    global internvl2_pipeline, MODEL_LOADED

    # If already loaded, return
    if internvl2_pipeline is not None:
        return True

    # If lmdeploy is not available, we'll use a demo placeholder
    if not LMDEPLOY_AVAILABLE:
        print("lmdeploy not available. Using demo placeholder.")
        MODEL_LOADED = False
        return False

    print("Loading InternVL2 model...")
    try:
        # Configure for AWQ quantized model
        backend_config = TurbomindEngineConfig(model_format='awq')

        # Create pipeline
        internvl2_pipeline = pipeline(
            MODEL_ID,
            backend_config=backend_config,
            log_level='INFO',
        )
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the UI.
        print(f"Error loading InternVL2 model: {e}")
        if "CUDA out of memory" in str(e):
            print("Not enough GPU memory for the model")
        MODEL_LOADED = False
        return False

    print("InternVL2 model loaded successfully!")
    MODEL_LOADED = True
    return True


def analyze_image(image, prompt: str) -> str:
    """Analyze the image using the InternVL2 model.

    Args:
        image: A numpy array or a PIL image; it is converted to RGB before
            being passed to the model.
        prompt: The instruction sent to the model together with the image.

    Returns:
        The model's response text, or a human-readable message when lmdeploy
        is unavailable, the model cannot be loaded, or inference fails.
    """
    try:
        # Skip model loading if lmdeploy is not available
        if not LMDEPLOY_AVAILABLE:
            return ("This is a demo placeholder. The actual model couldn't be loaded because lmdeploy "
                    "is not properly installed. Check your installation and dependencies.")

        # Make sure the model is loaded
        if not load_internvl2_model():
            return "Couldn't load InternVL2 model. See logs for details."

        # Convert numpy array to PIL Image
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(image).convert('RGB')
        else:
            # If somehow it's already a PIL Image
            image_pil = image.convert('RGB')

        # Run inference with the model and return the response text
        response = internvl2_pipeline((prompt, image_pil))
        return response.text
    except Exception as e:
        # Top-level boundary for the UI callback: surface the error as text.
        print(f"Error in image analysis: {e}")
        # Try to clean up memory in case of error
        if USE_GPU:
            torch.cuda.empty_cache()
        return f"Error in image analysis: {e}"


def process_image(image, analysis_type: str = "general") -> str:
    """Process the image and return the analysis.

    Args:
        image: The uploaded image, or None when nothing was uploaded.
        analysis_type: One of the keys of ``ANALYSIS_PROMPTS``; any other
            value falls back to the general description prompt.

    Returns:
        The analysis followed by the elapsed time, or a request to upload an
        image when ``image`` is None.
    """
    if image is None:
        return "Please upload an image."

    # Define prompt based on analysis type
    prompt = ANALYSIS_PROMPTS.get(analysis_type, DEFAULT_PROMPT)

    start_time = time.time()

    # Get analysis from the model
    analysis = analyze_image(image, prompt)

    elapsed_time = time.time() - start_time
    return f"{analysis}\n\nAnalysis completed in {elapsed_time:.2f} seconds."


# Define the Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the image analysis demo."""
    with gr.Blocks(title="Image Analysis with InternVL2") as demo:
        gr.Markdown("# Image Analysis with InternVL2-40B")
        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")

        if not LMDEPLOY_AVAILABLE:
            gr.Markdown(
                "⚠️ **WARNING**: lmdeploy is not properly installed. This demo will not function correctly.",
                elem_classes=["warning-message"],
            )

        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(type="pil", label="Upload Image")
                analysis_type = gr.Radio(
                    list(ANALYSIS_PROMPTS),
                    label="Analysis Type",
                    value="general",
                )
                submit_btn = gr.Button("Analyze Image")

            with gr.Column(scale=2):
                output_text = gr.Textbox(label="Analysis Result", lines=20)

        submit_btn.click(
            fn=process_image,
            inputs=[input_image, analysis_type],
            outputs=output_text,
        )

        gr.Markdown("""
        ## Analysis Types
        - **General**: General description of the image
        - **Text**: Focus on identifying and transcribing text in the image
        - **Chart**: Detailed analysis of charts, graphs, and diagrams
        - **People**: Description of people, their appearance and actions
        - **Technical**: Technical analysis identifying objects and spatial relationships
        """)

        # Examples: only registered when the example image is present, because
        # cached examples are processed from that file and a missing file
        # would otherwise break the interface.
        if os.path.isfile(EXAMPLE_IMAGE):
            gr.Examples(
                examples=[[EXAMPLE_IMAGE, kind] for kind in EXAMPLE_ANALYSIS_TYPES],
                inputs=[input_image, analysis_type],
                outputs=output_text,
                fn=process_image,
                cache_examples=True,
            )

    return demo


# Main function
if __name__ == "__main__":
    # Create the Gradio interface
    demo = create_interface()

    # Launch the interface
    demo.launch(share=False)