Pruthvi369i committed
Commit 6a21530 · verified · 1 Parent(s): 867a39c

Update app.py

Files changed (1): app.py (+37, -27)
app.py CHANGED
@@ -1,54 +1,59 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
 from PIL import Image
 
 # Model ID
 MODEL_ID = "0llheaven/Llama-3.2-11B-Vision-Radiology-mini"
 
-# Configure 4-bit quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True
-)
-
 # Load tokenizer and processor
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-# Load the model with quantization
-print("Loading model with 4-bit quantization...")
+# Load the model with reduced precision and memory optimizations
+print("Loading model with memory optimizations...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    quantization_config=quantization_config,
-    device_map="auto",
+    torch_dtype=torch.float16,    # Use half precision
+    device_map="auto",            # Let the library decide how to map the model
+    low_cpu_mem_usage=True,       # Optimize CPU memory usage
+    offload_folder="offload",     # Offload weights to disk if needed
+    offload_state_dict=True,      # Enable state dict offloading
     trust_remote_code=True,
 )
 print("Model loaded!")
 
-def generate_response(image_file, prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
+# Clear CUDA cache after loading
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+
+def generate_response(image_file, prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
     try:
         # Process image if provided
         if image_file is not None:
             image = Image.open(image_file).convert('RGB')
 
-            # Process inputs with processor
+            # Process inputs
             inputs = processor(
                 text=prompt,
                 images=image,
                 return_tensors="pt"
-            ).to(model.device)
+            )
 
-            # For multimodal models, we need to handle the inputs differently
-            # Extract only the input_ids and attention_mask for generation
-            input_ids = inputs.get("input_ids")
-            attention_mask = inputs.get("attention_mask", None)
+            # Move inputs to the same device as model
+            inputs = {k: v.to(model.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
 
-            # Generate response
+            # For safer generation, extract only what's needed
+            input_ids = inputs.pop("input_ids", None)
+            attention_mask = inputs.pop("attention_mask", None)
+
+            # Generate response with conservative memory settings
             with torch.no_grad():
+                # Clear cache before generation
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
                 outputs = model.generate(
                     input_ids=input_ids,
                     attention_mask=attention_mask,
@@ -65,8 +70,12 @@ def generate_response(image_file, prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
             # Text-only input
             inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-            # Generate response
+            # Generate response with conservative memory settings
            with torch.no_grad():
+                # Clear cache before generation
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+
                 outputs = model.generate(
                     **inputs,
                     max_new_tokens=max_new_tokens,
@@ -78,7 +87,7 @@ def generate_response(image_file, prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
         # Decode and return the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Remove the input prompt from the response
+        # Remove the input prompt from the response if present
         if response.startswith(prompt):
             response = response[len(prompt):].strip()
 
@@ -98,7 +107,7 @@ with gr.Blocks() as demo:
         prompt_input = gr.Textbox(label="Question or Prompt", placeholder="Describe what you see in this image and identify any abnormalities.")
 
         with gr.Row():
-            max_tokens = gr.Slider(minimum=16, maximum=1024, value=512, step=8, label="Max New Tokens")
+            max_tokens = gr.Slider(minimum=16, maximum=512, value=256, step=8, label="Max New Tokens")
             temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
             top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
 
@@ -115,10 +124,11 @@ with gr.Blocks() as demo:
 
     gr.Examples(
        [
-            ["sample_xray.jpg", "Describe what you see in this chest X-ray and identify any abnormalities."],
-            ["sample_ct.jpg", "Analyze this CT scan and provide a detailed report."],
+            ["sample_xray.jpg", "What abnormalities do you see in this X-ray?"],
+            ["sample_ct.jpg", "Describe this image and any findings."],
        ],
        inputs=[image_input, prompt_input],
    )
 
-demo.launch()
+# Reduce maximum allowed concurrent users to conserve memory
+demo.launch(max_threads=1)
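
Note on the change: the update drops the 4-bit BitsAndBytesConfig quantization and instead loads the checkpoint in float16 with device_map="auto", low_cpu_mem_usage, and disk offload, clears the CUDA cache around generation, halves the default max_new_tokens, and caps Gradio at a single worker thread. Below is a minimal sketch, not part of the commit, of how one might check where the weights actually landed and how much GPU memory the fp16 load uses; it assumes a CUDA device and that the accelerate-backed loader populated hf_device_map, and the helper name report_memory_and_placement is purely illustrative.

import torch

def report_memory_and_placement(model):
    # hf_device_map is set by the accelerate-backed loader when device_map is used;
    # values are GPU indices, "cpu", or "disk" for modules sent to the offload folder
    device_map = getattr(model, "hf_device_map", None)
    if device_map:
        for module_name, device in device_map.items():
            print(f"{module_name or '<root>'} -> {device}")

    # Rough footprint of the weights currently resident on the GPU
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f"GPU memory: {allocated:.2f} GiB allocated, {reserved:.2f} GiB reserved")

report_memory_and_placement(model)  # call once after from_pretrained(...)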