Manireddy1508 committed
Commit 9362fe6 · verified
1 Parent(s): 55c7101

Update utils/planner.py

Files changed (1):
  1. utils/planner.py (+30 -23)
utils/planner.py CHANGED
@@ -24,7 +24,7 @@ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
 
 # ----------------------------
-# 🧠 Load CLIP Tokenizer (for optional diagnostics)
+# 🧠 Load CLIP Tokenizer (for token check)
 # ----------------------------
 tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 
@@ -36,6 +36,8 @@ def generate_blip_caption(image: Image.Image) -> str:
         inputs = processor(images=image, return_tensors="pt").to(device)
         out = blip_model.generate(**inputs, max_length=50)
         caption = processor.decode(out[0], skip_special_tokens=True)
+        # Clean duplicate tokens
+        caption = " ".join(dict.fromkeys(caption.split()))
         print(f"🖼️ BLIP Caption: {caption}")
         return caption
     except Exception as e:
@@ -43,11 +45,11 @@ def generate_blip_caption(image: Image.Image) -> str:
         return "a product image"
 
 # ----------------------------
-# 🧠 GPT Scene Planning
+# 🧠 GPT Scene Planning with Caption + Visual Style
 # ----------------------------
 SCENE_SYSTEM_INSTRUCTIONS = """
 You are a scene planning assistant for an AI image generation system.
-Your job is to take a caption from a product image and a user prompt, then return a structured JSON with:
+Your job is to take a caption from a product image, a visual style hint, and a user prompt, then return a structured JSON with:
 - scene (environment, setting)
 - subject (main_actor)
 - objects (main_product or items)
@@ -59,7 +61,13 @@ Respond ONLY in raw JSON format. Do NOT include explanations.
 def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
     try:
         caption = generate_blip_caption(image)
-        merged_prompt = f"Image Caption: {caption}\nUser Prompt: {prompt}"
+        visual_hint = caption if "shoe" in caption or "product" in caption else "low-top product photo on white background"
+
+        merged_prompt = (
+            f"Image Caption: {caption}\n"
+            f"Image Visual Style: {visual_hint}\n"
+            f"User Prompt: {prompt}"
+        )
 
         response = client.chat.completions.create(
             model="gpt-4o-mini-2024-07-18",
@@ -73,10 +81,15 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
         content = response.choices[0].message.content
         print("🧠 Scene Plan (Raw):", content)
 
-        # Optional logging
+        # Logging
         os.makedirs("logs", exist_ok=True)
         with open("logs/scene_plans.jsonl", "a") as f:
-            f.write(json.dumps({"caption": caption, "prompt": prompt, "scene_plan": content}) + "\n")
+            f.write(json.dumps({
+                "caption": caption,
+                "visual_hint": visual_hint,
+                "prompt": prompt,
+                "scene_plan": content
+            }) + "\n")
 
         return json.loads(content)
 
@@ -91,48 +104,42 @@ def extract_scene_plan(prompt: str, image: Image.Image) -> dict:
     }
 
 # ----------------------------
-# ✨ GPT-Powered Prompt Variations (77-tokens safe)
+# ✨ Enriched Prompt Generation (GPT, 77-token safe)
 # ----------------------------
 ENRICHED_PROMPT_INSTRUCTIONS = """
 You are a prompt engineer for an AI image generation model.
-Given a structured scene plan and product prompt, generate a visually descriptive enriched prompt that:
-
+Given a structured scene plan and a user prompt, generate a single natural-language enriched prompt that:
 1. Describes the subject, product, setting, and layout clearly
-2. Stays strictly under 77 tokens (CLIP limit for SDXL)
-3. Is natural, realistic, and suitable for Stable Diffusion XL
-4. Does NOT include quotes, explanations, or bullet points — just the enriched prompt
-
-Return only the prompt as a string.
+2. Uses natural, photo-realistic language
+3. Stays strictly under 77 tokens (CLIP token limit)
+Return ONLY the enriched prompt string. No explanations.
 """
 
 def generate_prompt_variations_from_scene(scene_plan: dict, base_prompt: str, n: int = 3) -> list:
     prompts = []
     for _ in range(n):
         try:
-            user_input = f"Scene Plan:\n{json.dumps(scene_plan)}\n\nOriginal User Prompt:\n{base_prompt}"
+            user_input = f"Scene Plan:\n{json.dumps(scene_plan)}\n\nUser Prompt:\n{base_prompt}"
             response = client.chat.completions.create(
                 model="gpt-4o-mini-2024-07-18",
                 messages=[
                     {"role": "system", "content": ENRICHED_PROMPT_INSTRUCTIONS},
                     {"role": "user", "content": user_input}
                 ],
-                temperature=0.5,
+                temperature=0.4,
                 max_tokens=100
             )
             enriched = response.choices[0].message.content.strip()
-
-            # Optional: check token count for debug
             token_count = len(tokenizer(enriched)["input_ids"])
             print(f"📝 Enriched Prompt ({token_count} tokens): {enriched}")
-
             prompts.append(enriched)
         except Exception as e:
-            print("⚠️ Prompt variation fallback:", e)
+            print("⚠️ Prompt fallback:", e)
             prompts.append(base_prompt)
     return prompts
 
 # ----------------------------
-# ❌ Generate Negative Prompt
+# ❌ Negative Prompt Generator
 # ----------------------------
 NEGATIVE_SYSTEM_PROMPT = """
 You are a prompt engineer. Given a structured scene plan, generate a short negative prompt
@@ -152,9 +159,9 @@ def generate_negative_prompt_from_scene(scene_plan: dict) -> str:
             temperature=0.2,
             max_tokens=100
         )
-        negative = response.choices[0].message.content.strip()
-        return negative
+        return response.choices[0].message.content.strip()
     except Exception as e:
        print("❌ Negative Prompt Error:", e)
        return "blurry, distorted, low quality, deformed, watermark"
 
+
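For readers skimming the diff, here is a minimal standalone sketch of the two small behaviours this commit relies on: the word-level caption clean-up added to generate_blip_caption, and the CLIP token count used to keep enriched prompts under the 77-token limit. The helper names dedupe_caption and count_clip_tokens are illustrative only (they are not in the repository), and the snippet assumes the transformers package is installed.

from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

def dedupe_caption(caption: str) -> str:
    # dict.fromkeys keeps the first occurrence of each word and preserves order,
    # so any repeated word in the caption is dropped, not only adjacent duplicates.
    return " ".join(dict.fromkeys(caption.split()))

def count_clip_tokens(prompt: str) -> int:
    # input_ids includes the begin/end special tokens, matching the check in planner.py
    return len(tokenizer(prompt)["input_ids"])

if __name__ == "__main__":
    raw = "a pair of white sneakers sneakers on a white background"
    cleaned = dedupe_caption(raw)  # -> "a pair of white sneakers on background"
    print(cleaned, count_clip_tokens(cleaned))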