import gradio as gr
import torch
from transformers import AutoProcessor, AutoTokenizer, MllamaForConditionalGeneration
from PIL import Image

# Model ID
MODEL_ID = "0llheaven/Llama-3.2-11B-Vision-Radiology-mini"

# Load tokenizer and processor (the processor bundles the image preprocessor and the tokenizer)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the model with reduced precision and memory optimizations.
# Llama 3.2 Vision checkpoints use the Mllama architecture, so load the
# conditional-generation class (rather than AutoModelForCausalLM) so the
# vision tower is included.
print("Loading model with memory optimizations...")
model = MllamaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,   # Use half precision
    device_map="auto",           # Let accelerate decide how to place the weights
    low_cpu_mem_usage=True,      # Avoid materializing a second full copy in CPU RAM
    offload_folder="offload",    # Offload weights to disk if needed
    offload_state_dict=True,     # Offload the state dict while loading
    trust_remote_code=True,
)
print("Model loaded!")

# Clear CUDA cache after loading
if torch.cuda.is_available():
    torch.cuda.empty_cache()


def generate_response(image_file, prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
    try:
        if image_file is not None:
            # Image + text input
            image = Image.open(image_file).convert("RGB")

            # Build a chat-formatted prompt containing the <|image|> placeholder,
            # then let the processor pair it with the pixel data.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(
                images=image,
                text=input_text,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(model.device)
        else:
            # Text-only input
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate response with conservative memory settings
        with torch.no_grad():
            # Clear cache before generation
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            outputs = model.generate(
                **inputs,  # pass every processor output, including the image tensors
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
            )

        # Decode only the newly generated tokens so the prompt is not echoed back
        new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    except Exception as e:
        return f"Error: {str(e)}"


# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-3.2-11B Vision Radiology Model")
    gr.Markdown("Upload a radiology image (X-ray, CT, MRI, etc.) and ask questions about it.")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Radiology Image")
            prompt_input = gr.Textbox(
                label="Question or Prompt",
                placeholder="Describe what you see in this image and identify any abnormalities.",
            )
            with gr.Row():
                max_tokens = gr.Slider(minimum=16, maximum=512, value=256, step=8, label="Max New Tokens")
                temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
            submit_btn = gr.Button("Generate Response")
        with gr.Column():
            output = gr.Textbox(label="Model Response", lines=15)

    submit_btn.click(
        generate_response,
        inputs=[image_input, prompt_input, max_tokens, temperature, top_p],
        outputs=[output],
    )

    gr.Examples(
        [
            ["sample_xray.jpg", "What abnormalities do you see in this X-ray?"],
            ["sample_ct.jpg", "Describe this image and any findings."],
        ],
        inputs=[image_input, prompt_input],
    )

# Reduce maximum allowed concurrent users to conserve memory
demo.launch(max_threads=1)
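
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the script is saved as app.py; package names are
# the usual PyPI ones, and versions are not pinned by the original):
#
#   pip install torch transformers accelerate gradio pillow
#   python app.py
#
# accelerate backs device_map="auto" and the disk-offload options used above.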