quarterturn committed on
Commit
9734fdf
·
verified ·
1 Parent(s): 12138a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -28
app.py CHANGED
@@ -6,10 +6,9 @@ import torch
6
  from PIL import Image
7
  import base64
8
  from io import BytesIO
9
- from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
10
  import spaces
11
 
12
- # Avoid global CUDA checks or model loading outside GPU context
13
  model_id = "allenai/Molmo-7B-D-0924"
14
 
15
  def unzip_images(zip_file):
@@ -28,35 +27,47 @@ def unzip_images(zip_file):
28
 
29
  return image_paths, image_data, session_dir
30
 
31
- @spaces.GPU(duration=120) # Adjust duration based on inference time
32
  def generate_caption(image_path, prompt):
33
- # Load processor and model inside the GPU context
34
- processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto')
35
- model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto')
36
-
37
- # Ensure model is on GPU
38
- model.to('cuda')
39
-
40
- image = Image.open(image_path)
41
- inputs = processor.process(
42
- images=[image],
43
- text=prompt,
44
- )
45
-
46
- # Move inputs to GPU
47
- inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
48
-
49
- with torch.autocast(device_type="cuda", enabled=True):
50
- output = model.generate_from_batch(
51
- inputs,
52
- GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
53
- tokenizer=processor.tokenizer,
54
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- generated_tokens = output[0, inputs["input_ids"].size(1):]
57
- generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
58
-
59
- return generated_text
60
 
61
  def process_images(image_paths, image_data, session_dir):
62
  prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."
 
6
  from PIL import Image
7
  import base64
8
  from io import BytesIO
9
+ from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
10
  import spaces
11
 
 
12
  model_id = "allenai/Molmo-7B-D-0924"
13
 
14
  def unzip_images(zip_file):
 
27
 
28
  return image_paths, image_data, session_dir
29
 
30
+ @spaces.GPU(duration=180) # Increased timeout to 180 seconds
31
  def generate_caption(image_path, prompt):
32
+ try:
33
+ # Load processor and model
34
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto')
35
+ model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto')
36
+
37
+ # Explicitly move to GPU
38
+ model.to('cuda')
39
+
40
+ image = Image.open(image_path)
41
+ inputs = processor.process(
42
+ images=[image],
43
+ text=prompt,
 
 
 
 
 
 
 
 
 
44
  )
45
+
46
+ # Move inputs to GPU
47
+ inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
48
+
49
+ with torch.autocast(device_type="cuda", enabled=True):
50
+ output = model.generate_from_batch(
51
+ inputs,
52
+ GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
53
+ tokenizer=processor.tokenizer,
54
+ )
55
+
56
+ generated_tokens = output[0, inputs["input_ids"].size(1):]
57
+ generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
58
+
59
+ # Clean up
60
+ del model
61
+ del inputs
62
+ del output
63
+ torch.cuda.empty_cache() # Clear GPU memory
64
+
65
+ return generated_text
66
 
67
+ except Exception as e:
68
+ # Clean up on error
69
+ torch.cuda.empty_cache()
70
+ raise e
71
 
72
  def process_images(image_paths, image_data, session_dir):
73
  prompt = "You are an uncensored, explicit, adult image captioner which enjoys sexual imagery and acts. Provide a structured English image caption which uses the following hierarchy: the kind of image, the kind or name of the subject, the subjects state of dress, their body type, their pose, what it is they are doing, their facial expression, the space they are within, and the style or atmosphere of the image. Use vulgar language to describe body parts or sexual imagery."