Spaces:

quarterturn
/

molmo-natural-language-image-captioner

Running on Zero

App Files Files Community

quarterturn committed on Mar 24

Commit

9734fdf

verified ·

1 Parent(s): 12138a6

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -28

app.py CHANGED Viewed

@@ -6,10 +6,9 @@ import torch
 from PIL import Image
 import base64
 from io import BytesIO
-from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
 import spaces
-# Avoid global CUDA checks or model loading outside GPU context
 model_id = "allenai/Molmo-7B-D-0924"
 def unzip_images(zip_file):
@@ -28,35 +27,47 @@ def unzip_images(zip_file):
     return image_paths, image_data, session_dir
-@spaces.GPU(duration=120)  # Adjust duration based on inference time
 def generate_caption(image_path, prompt):
-    # Load processor and model inside the GPU context
-    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto')
-    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto')
-    # Ensure model is on GPU
-    model.to('cuda')
-    image = Image.open(image_path)
-    inputs = processor.process(
-        images=[image],
-        text=prompt,
-    )
-    # Move inputs to GPU
-    inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
-    with torch.autocast(device_type="cuda", enabled=True):
-        output = model.generate_from_batch(
-            inputs,
-            GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
-            tokenizer=processor.tokenizer,
         )
-    generated_tokens = output[0, inputs["input_ids"].size(1):]
-    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    return generated_text
 def process_images(image_paths, image_data, session_dir):
     prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."

 from PIL import Image
 import base64
 from io import BytesIO
+from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
 import spaces
 model_id = "allenai/Molmo-7B-D-0924"
 def unzip_images(zip_file):
     return image_paths, image_data, session_dir
+@spaces.GPU(duration=180)  # Increased timeout to 180 seconds
 def generate_caption(image_path, prompt):
+    try:
+        # Load processor and model
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto')
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto')
+        # Explicitly move to GPU
+        model.to('cuda')
+        image = Image.open(image_path)
+        inputs = processor.process(
+            images=[image],
+            text=prompt,
         )
+        # Move inputs to GPU
+        inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
+        with torch.autocast(device_type="cuda", enabled=True):
+            output = model.generate_from_batch(
+                inputs,
+                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+                tokenizer=processor.tokenizer,
+            )
+        generated_tokens = output[0, inputs["input_ids"].size(1):]
+        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+        # Clean up
+        del model
+        del inputs
+        del output
+        torch.cuda.empty_cache()  # Clear GPU memory
+        return generated_text
+    except Exception as e:
+        # Clean up on error
+        torch.cuda.empty_cache()
+        raise e
 def process_images(image_paths, image_data, session_dir):
     prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."