Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -6,10 +6,9 @@ import torch
|
|
6 |
from PIL import Image
|
7 |
import base64
|
8 |
from io import BytesIO
|
9 |
-
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
|
10 |
import spaces
|
11 |
|
12 |
-
# Avoid global CUDA checks or model loading outside GPU context
|
13 |
model_id = "allenai/Molmo-7B-D-0924"
|
14 |
|
15 |
def unzip_images(zip_file):
|
@@ -28,35 +27,47 @@ def unzip_images(zip_file):
|
|
28 |
|
29 |
return image_paths, image_data, session_dir
|
30 |
|
31 |
-
@spaces.GPU(duration=
|
32 |
def generate_caption(image_path, prompt):
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# Move inputs to GPU
|
47 |
-
inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
|
48 |
-
|
49 |
-
with torch.autocast(device_type="cuda", enabled=True):
|
50 |
-
output = model.generate_from_batch(
|
51 |
-
inputs,
|
52 |
-
GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
|
53 |
-
tokenizer=processor.tokenizer,
|
54 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
|
61 |
def process_images(image_paths, image_data, session_dir):
|
62 |
prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."
|
|
|
6 |
from PIL import Image
|
7 |
import base64
|
8 |
from io import BytesIO
|
9 |
+
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
|
10 |
import spaces
|
11 |
|
|
|
12 |
model_id = "allenai/Molmo-7B-D-0924"
|
13 |
|
14 |
def unzip_images(zip_file):
|
|
|
27 |
|
28 |
return image_paths, image_data, session_dir
|
29 |
|
30 |
+
@spaces.GPU(duration=180)  # Increased timeout to 180 seconds
def generate_caption(image_path, prompt):
    """Generate a caption for a single image with the model named by ``model_id``.

    The processor and model are loaded inside this call (i.e. inside the
    ``spaces.GPU`` context) and every GPU-side reference is dropped again
    before returning, whether the call succeeds or fails.

    Args:
        image_path: Path of the image file to caption.
        prompt: Instruction text passed to the processor alongside the image.

    Returns:
        The decoded text of the tokens generated after the prompt, with
        special tokens removed.

    Raises:
        Exception: Any error raised while loading, preprocessing or
            generating propagates unchanged after cleanup has run.
    """
    # Pre-bind so the ``finally`` block can release them even if loading fails.
    model = inputs = output = None
    try:
        # Load processor and model
        processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True, torch_dtype='auto'
        )
        # ``device_map='auto'`` already places the weights; a further
        # ``model.to('cuda')`` is redundant and fails if any module was
        # offloaded, so inputs are moved to ``model.device`` instead.
        model = AutoModelForCausalLM.from_pretrained(
            model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto'
        )

        # Close the file handle promptly and normalise to RGB: palette,
        # greyscale or RGBA inputs are a documented cause of processor errors.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        inputs = processor.process(
            images=[image],
            text=prompt,
        )

        # Move inputs to the model's device and add a batch dimension of 1
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

        with torch.autocast(device_type="cuda", enabled=True):
            output = model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
                tokenizer=processor.tokenizer,
            )

        # Keep only the tokens produced after the prompt.
        generated_tokens = output[0, inputs["input_ids"].size(1):]
        return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    finally:
        # Drop the references first; ``empty_cache`` can only return memory
        # that no live tensor is still holding on to.
        model = inputs = output = None
        torch.cuda.empty_cache()  # Clear GPU memory
|
71 |
|
72 |
def process_images(image_paths, image_data, session_dir):
|
73 |
prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."
|