Spaces:
Running
Running
Rishi Desai
committed on
Commit
·
ebbd273
1
Parent(s):
d8dc4ed
removed hardcoded trigger word
Browse files- caption.py +20 -20
caption.py
CHANGED
@@ -3,6 +3,7 @@ import io
|
|
3 |
import os
|
4 |
from together import Together
|
5 |
|
|
|
6 |
|
7 |
def get_system_prompt():
|
8 |
return """Automated Image Captioning (for LoRA Training)
|
@@ -10,9 +11,8 @@ def get_system_prompt():
|
|
10 |
Role: You are an expert AI captioning system generating precise, structured descriptions for character images optimized for LoRA model training in Stable Diffusion and Flux.1-dev.
|
11 |
|
12 |
IMPORTANT: You MUST follow these rules EXACTLY:
|
13 |
-
1. EVERY caption MUST start with the word "
|
14 |
-
2. You MUST use the exact format:
|
15 |
-
3. DO NOT include any additional text, explanations, or formatting
|
16 |
4. DO NOT use bullet points, lists, or any other formatting
|
17 |
5. DO NOT include any text before or after the caption
|
18 |
6. If you don't follow this format exactly, the caption will be rejected
|
@@ -34,7 +34,7 @@ Avoid Describing These Unless Variable Across Dataset or Uncertain from Concept:
|
|
34 |
- Known accessories that always appear (unless outfit-specific)
|
35 |
|
36 |
Caption Format (MUST FOLLOW EXACTLY):
|
37 |
-
|
38 |
|
39 |
Captioning Principles:
|
40 |
- Emphasize visual variation and context-specific details (outfit, pose, lighting, expression, camera angle).
|
@@ -44,15 +44,15 @@ Captioning Principles:
|
|
44 |
- Specify lighting conditions (soft lighting, harsh shadows, glowing backlight).
|
45 |
- Explicitly state camera angle (e.g., front view, right side profile, low-angle, high-angle, overhead).
|
46 |
- Avoid mentioning real or fictional identities.
|
47 |
-
- Always prefix with the trigger word "
|
48 |
|
49 |
Examples (MUST FOLLOW THIS EXACT FORMAT):
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
REMEMBER: Your response must be a single line starting with "
|
56 |
"""
|
57 |
|
58 |
|
@@ -82,10 +82,10 @@ def get_together_client():
|
|
82 |
|
83 |
def extract_caption(line):
|
84 |
"""Extract caption from a line of text."""
|
85 |
-
if
|
86 |
-
# If caption doesn't start with
|
87 |
-
if not line.startswith(
|
88 |
-
return line[line.index(
|
89 |
return line
|
90 |
return ""
|
91 |
|
@@ -117,7 +117,7 @@ def caption_single_image(client, img_str):
|
|
117 |
break
|
118 |
|
119 |
if not caption:
|
120 |
-
error_msg = "Failed to extract a valid caption (containing '
|
121 |
error_msg += f"\n\nActual response:\n{full_response}"
|
122 |
raise CaptioningError(error_msg)
|
123 |
|
@@ -155,8 +155,8 @@ def process_batch_response(response, image_strings):
|
|
155 |
image_count = len(image_strings)
|
156 |
captions = [""] * image_count
|
157 |
|
158 |
-
# Extract lines that start with or contain
|
159 |
-
caption_lines = [line for line in lines if
|
160 |
|
161 |
# Assign captions to images
|
162 |
for i in range(image_count):
|
@@ -171,7 +171,7 @@ def process_batch_response(response, image_strings):
|
|
171 |
def validate_batch_captions(captions, image_count, full_response):
|
172 |
"""Validate captions extracted from a batch response."""
|
173 |
# Check if all captions are empty or don't contain the trigger word
|
174 |
-
valid_captions = [c for c in captions if c and
|
175 |
if not valid_captions:
|
176 |
error_msg = "Failed to parse any valid captions from batch response."
|
177 |
error_msg += f"\n\nActual response:\n{full_response}"
|
@@ -180,7 +180,7 @@ def validate_batch_captions(captions, image_count, full_response):
|
|
180 |
# Check if some captions are missing
|
181 |
if len(valid_captions) < image_count:
|
182 |
missing_count = image_count - len(valid_captions)
|
183 |
-
invalid_captions = [(i, c) for i, c in enumerate(captions) if not c or
|
184 |
error_msg = f"Failed to parse captions for {missing_count} of {image_count} images in batch mode"
|
185 |
error_msg += "\n\nMalformed captions:"
|
186 |
for idx, caption in invalid_captions:
|
@@ -204,6 +204,6 @@ def extract_captions(file_path):
|
|
204 |
captions = []
|
205 |
with open(file_path, 'r') as file:
|
206 |
for line in file:
|
207 |
-
if line.startswith(
|
208 |
captions.append(line.strip())
|
209 |
return captions
|
|
|
3 |
import os
|
4 |
from together import Together
|
5 |
|
6 |
+
TRIGGER_WORD = "tr1gger"
|
7 |
|
8 |
def get_system_prompt():
|
9 |
return """Automated Image Captioning (for LoRA Training)
|
|
|
11 |
Role: You are an expert AI captioning system generating precise, structured descriptions for character images optimized for LoRA model training in Stable Diffusion and Flux.1-dev.
|
12 |
|
13 |
IMPORTANT: You MUST follow these rules EXACTLY:
|
14 |
+
1. EVERY caption MUST start with the word "{TRIGGER_WORD}" (exactly like this, no variations)
|
15 |
+
2. You MUST use the exact format: {TRIGGER_WORD} [Style], [Notable Visual Features], [Clothing], [Pose], [Expression/Mood], [Background/Setting], [Lighting], [Camera Angle]
|
|
|
16 |
4. DO NOT use bullet points, lists, or any other formatting
|
17 |
5. DO NOT include any text before or after the caption
|
18 |
6. If you don't follow this format exactly, the caption will be rejected
|
|
|
34 |
- Known accessories that always appear (unless outfit-specific)
|
35 |
|
36 |
Caption Format (MUST FOLLOW EXACTLY):
|
37 |
+
{TRIGGER_WORD} [Style], [Notable Visual Features], [Clothing], [Pose], [Expression/Mood], [Background/Setting], [Lighting], [Camera Angle]
|
38 |
|
39 |
Captioning Principles:
|
40 |
- Emphasize visual variation and context-specific details (outfit, pose, lighting, expression, camera angle).
|
|
|
44 |
- Specify lighting conditions (soft lighting, harsh shadows, glowing backlight).
|
45 |
- Explicitly state camera angle (e.g., front view, right side profile, low-angle, high-angle, overhead).
|
46 |
- Avoid mentioning real or fictional identities.
|
47 |
+
- Always prefix with the trigger word "{TRIGGER_WORD}."
|
48 |
|
49 |
Examples (MUST FOLLOW THIS EXACT FORMAT):
|
50 |
+
{TRIGGER_WORD} photorealistic, combat gear, tactical vest and gloves, standing in profile, neutral, empty room, overcast lighting, side profile
|
51 |
+
{TRIGGER_WORD} 3D-rendered, digital patterns, hooded cloak, seated cross-legged, calm, meditation chamber, low ambient lighting, front view
|
52 |
+
{TRIGGER_WORD} anime-style, school uniform with blue necktie, standing with arms behind back, gentle smile, classroom, soft daylight, three-quarter view
|
53 |
+
{TRIGGER_WORD} photorealistic, long trench coat and combat boots, walking, determined, rain-soaked street, dramatic shadows, low-angle view
|
54 |
|
55 |
+
REMEMBER: Your response must be a single line starting with "{TRIGGER_WORD}" and following the exact format above. No additional text, formatting, or explanations are allowed.
|
56 |
"""
|
57 |
|
58 |
|
|
|
82 |
|
83 |
def extract_caption(line):
    """Return the caption portion of ``line``, or an empty string if none.

    The caption is taken to start at the first occurrence of TRIGGER_WORD and
    to run through the end of ``line``; any text that precedes the trigger
    word is dropped. A line that does not contain TRIGGER_WORD yields "".
    """
    start = line.find(TRIGGER_WORD)
    if start == -1:
        # The trigger word is absent, so this line carries no caption.
        return ""
    # When the line already begins with the trigger word, start is 0 and the
    # slice is the whole line.
    return line[start:]
|
91 |
|
|
|
117 |
break
|
118 |
|
119 |
if not caption:
|
120 |
+
error_msg = f"Failed to extract a valid caption (containing '{TRIGGER_WORD}') from the response"
|
121 |
error_msg += f"\n\nActual response:\n{full_response}"
|
122 |
raise CaptioningError(error_msg)
|
123 |
|
|
|
155 |
image_count = len(image_strings)
|
156 |
captions = [""] * image_count
|
157 |
|
158 |
+
# Extract lines that start with or contain trigger_word
|
159 |
+
caption_lines = [line for line in lines if TRIGGER_WORD in line]
|
160 |
|
161 |
# Assign captions to images
|
162 |
for i in range(image_count):
|
|
|
171 |
def validate_batch_captions(captions, image_count, full_response):
|
172 |
"""Validate captions extracted from a batch response."""
|
173 |
# Check if all captions are empty or don't contain the trigger word
|
174 |
+
valid_captions = [c for c in captions if c and TRIGGER_WORD in c]
|
175 |
if not valid_captions:
|
176 |
error_msg = "Failed to parse any valid captions from batch response."
|
177 |
error_msg += f"\n\nActual response:\n{full_response}"
|
|
|
180 |
# Check if some captions are missing
|
181 |
if len(valid_captions) < image_count:
|
182 |
missing_count = image_count - len(valid_captions)
|
183 |
+
invalid_captions = [(i, c) for i, c in enumerate(captions) if not c or TRIGGER_WORD not in c]
|
184 |
error_msg = f"Failed to parse captions for {missing_count} of {image_count} images in batch mode"
|
185 |
error_msg += "\n\nMalformed captions:"
|
186 |
for idx, caption in invalid_captions:
|
|
|
204 |
captions = []
|
205 |
with open(file_path, 'r') as file:
|
206 |
for line in file:
|
207 |
+
if line.startswith(TRIGGER_WORD):
|
208 |
captions.append(line.strip())
|
209 |
return captions
|