Spaces:
Running
Running
Rishi Desai
committed on
Commit
·
c9dac35
1
Parent(s):
8f9a600
init dump with gpt 4o
Browse files- caption.py +102 -0
- main.py +138 -0
caption.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import io
|
3 |
+
import os
|
4 |
+
from openai import OpenAI
|
5 |
+
from PIL import Image
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
def get_prompt():
    """Return the system prompt used to caption images for LoRA training.

    The prompt instructs the model to emit one-line, trigger-word-prefixed
    captions focused on variable traits (outfit, pose, lighting, camera angle)
    while omitting constant character traits.
    """
    # Fix: guideline 3 was missing the verb "use" ("Do not evaluative..."),
    # which made the instruction ungrammatical for the captioning model.
    return """Automated Image Captioning (for LoRA Training)

Role: You are an expert AI captioning system generating precise, structured descriptions for AI-generated character images optimized for LoRA model training in Stable Diffusion and Flux.1-dev.

General Guidelines:
1. Prioritize Consistency – Maintain uniform descriptions across all images in a dataset. Avoid introducing variation in features that should remain constant (e.g., fixed traits like eye color, hair color, or markings that are inherently part of the concept and handled during model training).
2. Concise and Structured – Only describe visible and significant visual attributes. Use a standardized format for clarity and efficiency.
3. Omit Subjective Language – Do not use evaluative or emotional descriptors like "beautiful" or "scary."
4. Focus on Key Visual Cues – Clearly describe clothing, accessories, pose, facial expression, lighting, and camera angle. Mention distinctive features only if variable (e.g., visible scar, glasses).
5. Adapt to Visual Style – Clearly state the artistic style: "anime-style," "photorealistic," "3D-rendered," etc.
6. Standard Format – Ensure captions follow a consistent structure.
7. Remain Objective – Do not reference known characters, franchises, or people, even if recognizable. Describe only what is visually present.

Avoid Describing These Unless Variable Across Dataset or Uncertain from Concept:
- Eye color
- Hair color
- Skin tone
- Tattoos or markings if core to the concept
- Known accessories that always appear (unless outfit-specific)

Updated Caption Format:
tr1gger [Style], [Notable Visual Features], [Clothing], [Pose], [Expression], [Lighting], [Camera Angle]

Captioning Principles:
- Emphasize visual variation and context-specific details (outfit, pose, lighting, expression, camera angle).
- Exclude constant traits unless clearly deviating.
- Include visible clothing and accessories.
- Clearly define pose and facial expression (neutral, smiling, aggressive).
- Specify lighting conditions (soft lighting, harsh shadows, glowing backlight).
- Explicitly state camera angle (e.g., front view, right side profile, low-angle, high-angle, overhead).
- Avoid mentioning real or fictional identities.
- Always prefix with the trigger word "tr1gger."

Updated Examples:
- tr1gger photorealistic, tactical vest and gloves, standing in profile, neutral expression, overcast lighting, side profile
- tr1gger 3D-rendered, hooded cloak with digital pattern, seated cross-legged, calm expression, low ambient lighting, front view
- tr1gger anime-style, school uniform with blue necktie, standing with arms behind back, gentle smile, soft daylight, three-quarter view
- tr1gger photorealistic, long trench coat and combat boots, walking through rain-soaked street, determined expression, dramatic shadows, low-angle view
"""
|
50 |
+
|
51 |
+
def caption_images(images, model="gpt-4o", max_tokens=512):
    """Generate one caption per PIL image via the OpenAI chat completions API.

    Args:
        images: Iterable of PIL.Image objects to caption.
        model: Chat-completions model name. Defaults to "gpt-4o", matching
            the original hard-coded value, so existing callers are unaffected.
        max_tokens: Completion token budget per request (default 512, as before).

    Returns:
        List of caption strings, one per input image, in input order.
    """
    # Convert PIL images to base64 encoded strings
    image_strings = []
    for image in images:
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        image_strings.append(img_str)

    # OpenAI() reads OPENAI_API_KEY from the environment (loaded via load_dotenv above).
    client = OpenAI()
    captions = []

    # Start a separate chat session for each image so captions stay independent
    # (no cross-image context leaking between requests).
    for img_str in image_strings:
        messages = [
            {"role": "system", "content": get_prompt()},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Caption this image according to the guidelines."},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}"}}
                ]
            }
        ]

        # Request caption for the image in a single chat
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens
        )

        # Extract caption from the response
        caption = response.choices[0].message.content.strip()
        captions.append(caption)

    return captions
|
88 |
+
|
89 |
+
# Example usage: caption two bundled sample images and print the results.
if __name__ == "__main__":
    # Fail fast with a clear message before attempting any API call.
    if not os.environ.get("OPENAI_API_KEY"):
        print("Please update the .env file with your OpenAI API key.")
        exit(1)

    # Load the sample images, normalizing each to RGB for PNG re-encoding.
    image_paths = ['input/daenyrs_hd.jpg', 'input/girl_body.png']
    images = []
    for path in image_paths:
        images.append(Image.open(path).convert("RGB"))

    # Generate and display one caption per image.
    captions = caption_images(images)
    for i, caption in enumerate(captions):
        print(f"Generated Caption for Image {i+1}: {caption}")
|
main.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import shutil
|
4 |
+
import sys
|
5 |
+
from pathlib import Path
|
6 |
+
from PIL import Image
|
7 |
+
from caption import caption_images
|
8 |
+
|
9 |
+
def is_image_file(filename):
    """Check if a file is an allowed image type (.png, .jpg, .jpeg, .webp)."""
    # str.endswith accepts a tuple of suffixes; comparison is case-insensitive
    # via lower(), matching the original behavior.
    return filename.lower().endswith(('.png', '.jpg', '.jpeg', '.webp'))
|
13 |
+
|
14 |
+
def is_unsupported_image(filename):
    """Check if a file is an image but not of an allowed type."""
    # Single case-insensitive suffix test against the rejected-format tuple.
    return filename.lower().endswith(('.bmp', '.gif', '.tiff', '.tif', '.ico', '.svg'))
|
18 |
+
|
19 |
+
def is_text_file(filename):
    """Check if a file is a text file (case-insensitive .txt suffix)."""
    name = filename.lower()
    return name.endswith('.txt')
|
22 |
+
|
23 |
+
def validate_input_directory(input_dir):
    """Validate that the input directory only contains allowed image formats.

    Exits the process (status 1) with an explanatory listing if any
    unsupported image formats or .txt files are present; returns None
    otherwise.
    """
    input_path = Path(input_dir)

    unsupported_files = []
    text_files = []

    # Single pass over directory entries; subdirectories are ignored.
    for entry in input_path.iterdir():
        if not entry.is_file():
            continue
        if is_unsupported_image(entry.name):
            unsupported_files.append(entry.name)
        elif is_text_file(entry.name):
            text_files.append(entry.name)

    if unsupported_files:
        print("Error: Unsupported image formats detected.")
        print("Only .png, .jpg, .jpeg, and .webp files are allowed.")
        print("The following files are not supported:")
        for file in unsupported_files:
            print(f"  - {file}")
        sys.exit(1)

    # Pre-existing .txt files would be clobbered by generated captions,
    # so their presence is treated as a hard error too.
    if text_files:
        print("Error: Text files detected in the input directory.")
        print("The input directory should only contain image files to prevent overwriting existing text files.")
        print("The following text files were found:")
        for file in text_files:
            print(f"  - {file}")
        sys.exit(1)
|
52 |
+
|
53 |
+
def process_images(input_dir, output_dir, fix_outfit=False):
    """Process all images in the input directory and generate captions.

    Loads every allowed image in *input_dir*, captions them all in one
    call to caption_images, writes a sibling .txt caption file per image,
    and — when *output_dir* differs from *input_dir* — copies both image
    and caption into *output_dir*. Errors are printed, not raised.

    Args:
        input_dir: Directory containing the images to caption.
        output_dir: Destination directory; falsy values fall back to input_dir.
        fix_outfit: NOTE(review): accepted but never used in this function —
            presumably meant to alter the captioning prompt for single-outfit
            characters; confirm intent and wire it up or drop it.
    """
    input_path = Path(input_dir)
    # Falsy output_dir (None/"") means "write next to the inputs".
    output_path = Path(output_dir) if output_dir else input_path

    # Validate the input directory first (exits the process on violations)
    validate_input_directory(input_dir)

    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Track the number of processed images
    processed_count = 0

    # Collect all images into a list
    images = []
    image_paths = []

    # Get all files in the input directory
    for file_path in input_path.iterdir():
        if file_path.is_file() and is_image_file(file_path.name):
            try:
                # Load the image; convert to RGB so PNG re-encoding in
                # caption_images works for any source mode.
                image = Image.open(file_path).convert("RGB")
                images.append(image)
                image_paths.append(file_path)
            except Exception as e:
                # Unreadable files are skipped, not fatal.
                print(f"Error loading {file_path.name}: {e}")

    # Log the number of images found
    print(f"Found {len(images)} images to process.")

    if not images:
        print("No valid images found to process.")
        return

    # Generate captions for all images in a single batched call
    try:
        captions = caption_images(images)
    except Exception as e:
        print(f"Error generating captions: {e}")
        return

    # Write captions to files. zip pairs captions with their source paths
    # because caption_images preserves input order.
    for file_path, caption in zip(image_paths, captions):
        try:
            # Create caption file path (same name but with .txt extension)
            caption_filename = file_path.stem + ".txt"
            # NOTE(review): the caption is always written into the *input*
            # directory first, then copied below — a distinct output dir
            # still leaves .txt files behind in the input dir. Confirm that
            # leftover is intended.
            caption_path = input_path / caption_filename

            # Write caption to file
            with open(caption_path, 'w', encoding='utf-8') as f:
                f.write(caption)

            # If output directory is different from input, copy files
            if output_path != input_path:
                # Copy image to output directory
                shutil.copy2(file_path, output_path / file_path.name)
                # Copy caption to output directory
                shutil.copy2(caption_path, output_path / caption_filename)

            processed_count += 1
            print(f"Processed {file_path.name} → {caption_filename}")
        except Exception as e:
            # Per-file failures don't abort the remaining files.
            print(f"Error processing {file_path.name}: {e}")

    print(f"\nProcessing complete. {processed_count} images were captioned.")
|
120 |
+
|
121 |
+
def _build_arg_parser():
    # Argument definitions for the captioning CLI, kept in one place.
    parser = argparse.ArgumentParser(description='Generate captions for images using GPT-4o.')
    parser.add_argument('--input', type=str, required=True, help='Directory containing images')
    parser.add_argument('--output', type=str, help='Directory to save images and captions (defaults to input directory)')
    parser.add_argument('--fix_outfit', action='store_true', help='Flag to indicate if character has one outfit')
    return parser

def main():
    """CLI entry point: parse arguments, validate the input dir, and caption."""
    args = _build_arg_parser().parse_args()

    # Guard clause: bail out early on a nonexistent input directory.
    if not os.path.isdir(args.input):
        print(f"Error: Input directory '{args.input}' does not exist.")
        return

    # Process images
    process_images(args.input, args.output, args.fix_outfit)
|
136 |
+
|
137 |
+
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|