prithivMLmods committed
Commit bdb88e6 · verified · 1 Parent(s): 335625e

Update app.py

Files changed (1): app.py +355 -359
app.py CHANGED
@@ -1,372 +1,368 @@
  import gradio as gr
  import torch
- from diffusers import FluxFillPipeline
- from diffusers.utils import load_image
- from PIL import Image, ImageDraw
  import numpy as np
- import spaces
- from huggingface_hub import hf_hub_download
-
- pipe = FluxFillPipeline.from_pretrained(
-     "black-forest-labs/FLUX.1-Fill-dev",
-     torch_dtype=torch.bfloat16
- ).to("cuda")
-
- def can_expand(source_width, source_height, target_width, target_height, alignment):
-     if alignment in ("Left", "Right") and source_width >= target_width:
-         return False
-     if alignment in ("Top", "Bottom") and source_height >= target_height:
-         return False
-     return True
-
- def prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
-     target_size = (width, height)
-
-     scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
-     new_width = int(image.width * scale_factor)
-     new_height = int(image.height * scale_factor)
-
-     source = image.resize((new_width, new_height), Image.LANCZOS)
-
-     if resize_option == "Full":
-         resize_percentage = 100
-     elif resize_option == "75%":
-         resize_percentage = 75
-     elif resize_option == "50%":
-         resize_percentage = 50
-     elif resize_option == "33%":
-         resize_percentage = 33
-     elif resize_option == "25%":
-         resize_percentage = 25
-     else: # Custom
-         resize_percentage = custom_resize_percentage
-
-     # Calculate new dimensions based on percentage
-     resize_factor = resize_percentage / 100
-     new_width = int(source.width * resize_factor)
-     new_height = int(source.height * resize_factor)
-
-     # Ensure minimum size of 64 pixels
-     new_width = max(new_width, 64)
-     new_height = max(new_height, 64)
-
-     # Resize the image
-     source = source.resize((new_width, new_height), Image.LANCZOS)
-
-     # Calculate the overlap in pixels based on the percentage
-     overlap_x = int(new_width * (overlap_percentage / 100))
-     overlap_y = int(new_height * (overlap_percentage / 100))
-
-     # Ensure minimum overlap of 1 pixel
-     overlap_x = max(overlap_x, 1)
-     overlap_y = max(overlap_y, 1)
-
-     # Calculate margins based on alignment
-     if alignment == "Middle":
-         margin_x = (target_size[0] - new_width) // 2
-         margin_y = (target_size[1] - new_height) // 2
-     elif alignment == "Left":
-         margin_x = 0
-         margin_y = (target_size[1] - new_height) // 2
-     elif alignment == "Right":
-         margin_x = target_size[0] - new_width
-         margin_y = (target_size[1] - new_height) // 2
-     elif alignment == "Top":
-         margin_x = (target_size[0] - new_width) // 2
-         margin_y = 0
-     elif alignment == "Bottom":
-         margin_x = (target_size[0] - new_width) // 2
-         margin_y = target_size[1] - new_height
-
-     # Adjust margins to eliminate gaps
-     margin_x = max(0, min(margin_x, target_size[0] - new_width))
-     margin_y = max(0, min(margin_y, target_size[1] - new_height))
-
-     # Create a new background image and paste the resized source image
-     background = Image.new('RGB', target_size, (255, 255, 255))
-     background.paste(source, (margin_x, margin_y))
-
-     # Create the mask
-     mask = Image.new('L', target_size, 255)
-     mask_draw = ImageDraw.Draw(mask)

-     # Calculate overlap areas
-     white_gaps_patch = 2

-     left_overlap = margin_x + overlap_x if overlap_left else margin_x + white_gaps_patch
-     right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width - white_gaps_patch
-     top_overlap = margin_y + overlap_y if overlap_top else margin_y + white_gaps_patch
-     bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height - white_gaps_patch

-     if alignment == "Left":
-         left_overlap = margin_x + overlap_x if overlap_left else margin_x
-     elif alignment == "Right":
-         right_overlap = margin_x + new_width - overlap_x if overlap_right else margin_x + new_width
-     elif alignment == "Top":
-         top_overlap = margin_y + overlap_y if overlap_top else margin_y
-     elif alignment == "Bottom":
-         bottom_overlap = margin_y + new_height - overlap_y if overlap_bottom else margin_y + new_height
-
-     # Draw the mask
-     mask_draw.rectangle([
-         (left_overlap, top_overlap),
-         (right_overlap, bottom_overlap)
-     ], fill=0)
-
-     return background, mask

  @spaces.GPU
- def inpaint(image, width, height, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom, progress=gr.Progress(track_tqdm=True)):
-
-     background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom)

-     if not can_expand(background.width, background.height, width, height, alignment):
-         alignment = "Middle"
-
-     cnet_image = background.copy()
-     cnet_image.paste(0, (0, 0), mask)
-
-     final_prompt = prompt_input
-
-     #generator = torch.Generator(device="cuda").manual_seed(42)
-
-     result = pipe(
-         prompt=final_prompt,
-         height=height,
-         width=width,
-         image=cnet_image,
-         mask_image=mask,
-         num_inference_steps=num_inference_steps,
-         guidance_scale=30,
-     ).images[0]
-
-     result = result.convert("RGBA")
-     cnet_image.paste(result, (0, 0), mask)
-
-     return cnet_image, background
-
- def preview_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
-     background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom)

-     preview = background.copy().convert('RGBA')
-     red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))
-     red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
-     red_mask.paste(red_overlay, (0, 0), mask)
-     preview = Image.alpha_composite(preview, red_mask)

-     return preview
-
- def clear_result():
-     return gr.update(value=None)
-
- def preload_presets(target_ratio, ui_width, ui_height):
-     if target_ratio == "9:16":
-         return 720, 1280, gr.update()
-     elif target_ratio == "16:9":
-         return 1280, 720, gr.update()
-     elif target_ratio == "1:1":
-         return 1024, 1024, gr.update()
-     elif target_ratio == "Custom":
-         return ui_width, ui_height, gr.update(open=True)
-
- def select_the_right_preset(user_width, user_height):
-     if user_width == 720 and user_height == 1280:
-         return "9:16"
-     elif user_width == 1280 and user_height == 720:
-         return "16:9"
-     elif user_width == 1024 and user_height == 1024:
-         return "1:1"
      else:
-         return "Custom"
-
- def toggle_custom_resize_slider(resize_option):
-     return gr.update(visible=(resize_option == "Custom"))
-
- def update_history(new_image, history):
-     if history is None:
-         history = []
-     history.insert(0, new_image)
-     return history
-
- css = """
- .gradio-container {
-     max-width: 1250px !important;
- }
- """
-
- title = """<h1 align="center">Flux Outpaint Dev 🤩</h1>"""
-
- with gr.Blocks(css=css) as demo:
-     with gr.Column():
-         gr.HTML(title)
-
-         with gr.Row():
-             with gr.Column():
-                 input_image = gr.Image(
-                     type="pil",
-                     label="Input Image"
-                 )
-
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         prompt_input = gr.Textbox(label="Prompt (Optional)")
-                     with gr.Column(scale=1):
-                         run_button = gr.Button("Generate")
-
-                 with gr.Row():
-                     target_ratio = gr.Radio(
-                         label="Image Ratio",
-                         choices=["9:16", "16:9", "1:1", "Custom"],
-                         value="9:16",
-                         scale=3
-                     )
-                     alignment_dropdown = gr.Dropdown(
-                         choices=["Middle", "Left", "Right", "Top", "Bottom"],
-                         value="Middle",
-                         label="Alignment",
-                     )
-                 resize_option = gr.Radio(
-                     label="Resize input image",
-                     choices=["Full", "75%", "50%", "33%", "25%", "Custom"],
-                     value="75%"
-                 )
-                 custom_resize_percentage = gr.Slider(
-                     label="Custom resize (%)",
-                     minimum=1,
-                     maximum=100,
-                     step=1,
-                     value=50,
-                     visible=False
-                 )
-                 with gr.Accordion(label="Advanced settings", open=False) as settings_panel:
-                     with gr.Column():
-                         with gr.Row():
-                             width_slider = gr.Slider(
-                                 label="Target Width",
-                                 minimum=720,
-                                 maximum=1536,
-                                 step=8,
-                                 value=720,
-                             )
-                             height_slider = gr.Slider(
-                                 label="Target Height",
-                                 minimum=720,
-                                 maximum=1536,
-                                 step=8,
-                                 value=1280,
-                             )
-
-                         num_inference_steps = gr.Slider(label="Steps", minimum=2, maximum=50, step=1, value=28)
-                         with gr.Group():
-                             overlap_percentage = gr.Slider(
-                                 label="Mask overlap (%)",
-                                 minimum=1,
-                                 maximum=50,
-                                 value=10,
-                                 step=1
-                             )
-                             with gr.Row():
-                                 overlap_top = gr.Checkbox(label="Overlap Top", value=True)
-                                 overlap_right = gr.Checkbox(label="Overlap Right", value=True)
-                             with gr.Row():
-                                 overlap_left = gr.Checkbox(label="Overlap Left", value=True)
-                                 overlap_bottom = gr.Checkbox(label="Overlap Bottom", value=True)
-
-                 with gr.Column():
-                     preview_button = gr.Button("Preview alignment and mask")
-
-             with gr.Column():
-                 result = gr.Image(
-                     interactive=False,
-                     label="Generated Image",
-                 )
-                 use_as_input_button = gr.Button("Use as Input Image", visible=False)
-                 with gr.Accordion("History and Mask", open=False):
-                     history_gallery = gr.Gallery(label="History", columns=6, object_fit="contain", interactive=False)
-                     preview_image = gr.Image(label="Mask preview")
-
-     def use_output_as_input(output_image):
-         return output_image
-
-     use_as_input_button.click(
-         fn=use_output_as_input,
-         inputs=[result],
-         outputs=[input_image]
-     )

-     target_ratio.change(
-         fn=preload_presets,
-         inputs=[target_ratio, width_slider, height_slider],
-         outputs=[width_slider, height_slider, settings_panel],
-         queue=False
-     )
-
-     width_slider.change(
-         fn=select_the_right_preset,
-         inputs=[width_slider, height_slider],
-         outputs=[target_ratio],
-         queue=False
-     )
-
-     height_slider.change(
-         fn=select_the_right_preset,
-         inputs=[width_slider, height_slider],
-         outputs=[target_ratio],
-         queue=False
-     )
-
-     resize_option.change(
-         fn=toggle_custom_resize_slider,
-         inputs=[resize_option],
-         outputs=[custom_resize_percentage],
-         queue=False
-     )
-
-     run_button.click(
-         fn=clear_result,
-         inputs=None,
-         outputs=result,
-     ).then(
-         fn=inpaint,
-         inputs=[input_image, width_slider, height_slider, overlap_percentage, num_inference_steps,
-                 resize_option, custom_resize_percentage, prompt_input, alignment_dropdown,
-                 overlap_left, overlap_right, overlap_top, overlap_bottom],
-         outputs=[result, preview_image],
-     ).then(
-         fn=lambda x, history: update_history(x, history),
-         inputs=[result, history_gallery],
-         outputs=history_gallery,
-     ).then(
-         fn=lambda: gr.update(visible=True),
-         inputs=None,
-         outputs=use_as_input_button,
-     )
-
-     prompt_input.submit(
-         fn=clear_result,
-         inputs=None,
-         outputs=result,
-     ).then(
-         fn=inpaint,
-         inputs=[input_image, width_slider, height_slider, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment_dropdown,
-                 overlap_left, overlap_right, overlap_top, overlap_bottom],
-         outputs=[result, preview_image],
-     ).then(
-         fn=lambda x, history: update_history(x, history),
-         inputs=[result, history_gallery],
-         outputs=history_gallery,
-     ).then(
-         fn=lambda: gr.update(visible=True),
-         inputs=None,
-         outputs=use_as_input_button,
-     )
-
-     preview_button.click(
-         fn=preview_image_and_mask,
-         inputs=[input_image, width_slider, height_slider, overlap_percentage, resize_option, custom_resize_percentage, alignment_dropdown,
-                 overlap_left, overlap_right, overlap_top, overlap_bottom],
-         outputs=preview_image,
-         queue=False
-     )
-
- demo.queue(max_size=12).launch(share=False)
+ import os
+ import random
+ import uuid
+ import json
+ import time
+ import asyncio
+ import re
+ from threading import Thread
+
  import gradio as gr
+ import spaces
  import torch
  import numpy as np
+ from PIL import Image
+ import edge_tts
+
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     Qwen2VLForConditionalGeneration,
+     AutoProcessor,
+ )
+ from transformers.image_utils import load_image
+ from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+
+ DESCRIPTION = """
+ # SDXL LoRA DLC 🎃
+ """

+ css = '''
+ h1 {
+     text-align: center;
+     display: block;
+ }

+ #duplicate-button {
+     margin: auto;
+     color: #fff;
+     background: #1565c0;
+     border-radius: 100vh;
+ }
+ '''
+
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+ # -----------------------
+ # Progress Bar Helper
+ # -----------------------
+ def progress_bar_html(label: str) -> str:
+     """
+     Returns an HTML snippet for a thin progress bar with a label.
+     The progress bar is an animated magenta bar on a plum track.
+     """
+     return f'''
+ <div style="display: flex; align-items: center;">
+     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+     <div style="width: 110px; height: 5px; background-color: #DDA0DD; border-radius: 2px; overflow: hidden;">
+         <div style="width: 100%; height: 100%; background-color: #FF00FF; animation: loading 1.5s linear infinite;"></div>
+     </div>
+ </div>
+ <style>
+ @keyframes loading {{
+     0% {{ transform: translateX(-100%); }}
+     100% {{ transform: translateX(100%); }}
+ }}
+ </style>
+ '''
+
+ # -----------------------
+ # Text Generation Setup
+ # -----------------------
+ model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+ )
+ model.eval()
+
+ TTS_VOICES = [
+     "en-US-JennyNeural", # @tts1
+     "en-US-GuyNeural", # @tts2
+ ]
+
+ # -----------------------
+ # Multimodal OCR Setup
+ # -----------------------
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to("cuda").eval()
+
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+     """Convert text to speech using Edge TTS and save as MP3"""
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(output_file)
+     return output_file
+
+ def clean_chat_history(chat_history):
+     """
+     Filter out any chat entries whose "content" is not a string.
+     """
+     cleaned = []
+     for msg in chat_history:
+         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+             cleaned.append(msg)
+     return cleaned
+
+ # -----------------------
+ # Stable Diffusion Image Generation Setup
+ # -----------------------
+
+ MAX_SEED = np.iinfo(np.int32).max
+ USE_TORCH_COMPILE = False
+ ENABLE_CPU_OFFLOAD = False
+
+ if torch.cuda.is_available():
+     pipe = StableDiffusionXLPipeline.from_pretrained(
+         "SG161222/RealVisXL_V4.0_Lightning",
+         torch_dtype=torch.float16,
+         use_safetensors=True,
+     )
+     pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

+     # LoRA options with one example for each.
+     LORA_OPTIONS = {
+         "Realism": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
+         "Pixar": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
+         "Photoshoot": ("prithivMLmods/Canopus-Photo-Shoot-Mini-LoRA", "Canopus-Photo-Shoot-Mini-LoRA.safetensors", "photo"),
+         "Clothing": ("prithivMLmods/Canopus-Clothing-Adp-LoRA", "Canopus-Dress-Clothing-LoRA.safetensors", "clth"),
+         "Interior": ("prithivMLmods/Canopus-Interior-Architecture-0.1", "Canopus-Interior-Architecture-0.1δ.safetensors", "arch"),
+         "Fashion": ("prithivMLmods/Canopus-Fashion-Product-Dilation", "Canopus-Fashion-Product-Dilation.safetensors", "fashion"),
+         "Minimalistic": ("prithivMLmods/Pegasi-Minimalist-Image-Style", "Pegasi-Minimalist-Image-Style.safetensors", "minimalist"),
+         "Modern": ("prithivMLmods/Canopus-Modern-Clothing-Design", "Canopus-Modern-Clothing-Design.safetensors", "mdrnclth"),
+         "Animaliea": ("prithivMLmods/Canopus-Animaliea-Artism", "Canopus-Animaliea-Artism.safetensors", "Animaliea"),
+         "Wallpaper": ("prithivMLmods/Canopus-Liquid-Wallpaper-Art", "Canopus-Liquid-Wallpaper-Minimalize-LoRA.safetensors", "liquid"),
+         "Cars": ("prithivMLmods/Canes-Cars-Model-LoRA", "Canes-Cars-Model-LoRA.safetensors", "car"),
+         "PencilArt": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
+         "ArtMinimalistic": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
+     }
+
+     # Load all LoRA weights
+     for model_name, weight_name, adapter_name in LORA_OPTIONS.values():
+         pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
+     pipe.to("cuda")
+ else:
+     pipe = StableDiffusionXLPipeline.from_pretrained(
+         "SG161222/RealVisXL_V4.0_Lightning",
+         torch_dtype=torch.float32,
+         use_safetensors=True,
+     ).to(device)
+
+ def save_image(img: Image.Image) -> str:
+     """Save a PIL image with a unique filename and return the path."""
+     unique_name = str(uuid.uuid4()) + ".png"
+     img.save(unique_name)
+     return unique_name
+
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+
+ @spaces.GPU(duration=180, enable_queue=True)
+ def generate_image(
+     prompt: str,
+     negative_prompt: str = "",
+     seed: int = 0,
+     width: int = 1024,
+     height: int = 1024,
+     guidance_scale: float = 3.0,
+     randomize_seed: bool = True,
+     lora_model: str = "Realism",
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     seed = int(randomize_seed_fn(seed, randomize_seed))
+     effective_negative_prompt = negative_prompt # Use provided negative prompt if any
+     model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
+     pipe.set_adapters(adapter_name)
+     outputs = pipe(
+         prompt=prompt,
+         negative_prompt=effective_negative_prompt,
+         width=width,
+         height=height,
+         guidance_scale=guidance_scale,
+         num_inference_steps=28,
+         num_images_per_prompt=1,
+         cross_attention_kwargs={"scale": 0.65},
+         output_type="pil",
+     )
+     images = outputs.images
+     image_paths = [save_image(img) for img in images]
+     return image_paths, seed

+ # -----------------------
+ # Main Chat/Generation Function
+ # -----------------------
  @spaces.GPU
+ def generate(
+     input_dict: dict,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ):
+     """
+     Generates chatbot responses with support for multimodal input, TTS, and image generation.
+     Special commands:
+     - "@tts1" or "@tts2": triggers text-to-speech.
+     - "@<lora_command>": triggers image generation using the new LoRA pipeline.
+     Available commands (case-insensitive): @realism, @pixar, @photoshoot, @clothing, @interior, @fashion,
+     @minimalistic, @modern, @animaliea, @wallpaper, @cars, @pencilart, @artminimalistic.
+     """
+     text = input_dict["text"]
+     files = input_dict.get("files", [])

+     # Check for image generation command based on LoRA tags.
+     lora_mapping = {key.lower(): key for key in LORA_OPTIONS}
+     for key_lower, key in lora_mapping.items():
+         command_tag = "@" + key_lower
+         if text.strip().lower().startswith(command_tag):
+             prompt_text = text.strip()[len(command_tag):].strip()
+             yield progress_bar_html(f"Processing Image Generation ({key} style)")
+             image_paths, used_seed = generate_image(
+                 prompt=prompt_text,
+                 negative_prompt="",
+                 seed=1,
+                 width=1024,
+                 height=1024,
+                 guidance_scale=3,
+                 randomize_seed=True,
+                 lora_model=key,
+             )
+             yield progress_bar_html("Finalizing Image Generation")
+             yield gr.Image(image_paths[0])
+             return

+     # Check for TTS command (@tts1 or @tts2)
+     tts_prefix = "@tts"
+     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)

+     if is_tts and voice_index:
+         voice = TTS_VOICES[voice_index - 1]
+         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+         conversation = [{"role": "user", "content": text}]
      else:
+         voice = None
+         text = text.replace(tts_prefix, "").strip()
+         conversation = clean_chat_history(chat_history)
+         conversation.append({"role": "user", "content": text})

+     if files:
+         if len(files) > 1:
+             images = [load_image(image) for image in files]
+         elif len(files) == 1:
+             images = [load_image(files[0])]
+         else:
+             images = []
+         messages = [{
+             "role": "user",
+             "content": [
+                 *[{"type": "image", "image": image} for image in images],
+                 {"type": "text", "text": text},
+             ]
+         }]
+         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
+         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+         thread.start()
+
+         buffer = ""
+         yield progress_bar_html("Processing with Qwen2VL OCR")
+         for new_text in streamer:
+             buffer += new_text
+             buffer = buffer.replace("<|im_end|>", "")
+             time.sleep(0.01)
+             yield buffer
+     else:
+         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+         input_ids = input_ids.to(model.device)
+         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "streamer": streamer,
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "top_p": top_p,
+             "top_k": top_k,
+             "temperature": temperature,
+             "num_beams": 1,
+             "repetition_penalty": repetition_penalty,
+         }
+         t = Thread(target=model.generate, kwargs=generation_kwargs)
+         t.start()
+
+         outputs = []
+         for new_text in streamer:
+             outputs.append(new_text)
+             yield "".join(outputs)
+
+         final_response = "".join(outputs)
+         yield final_response
+
+         if is_tts and voice:
+             output_file = asyncio.run(text_to_speech(final_response, voice))
+             yield gr.Audio(output_file, autoplay=True)
+
+ # -----------------------
+ # Gradio Chat Interface
+ # -----------------------
+ demo = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+         gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+     ],
+     examples=[
+         ['@realism Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic'],
+         ["@pixar A young man with light brown wavy hair and light brown eyes sitting in an armchair and looking directly at the camera, pixar style, disney pixar, office background, ultra detailed, 1 man"],
+         ["@realism A futuristic cityscape with neon lights"],
+         ["@photoshoot A portrait of a person with dramatic lighting"],
+         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+         ["Python Program for Array Rotation"],
+         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+         ["@clothing Fashionable streetwear in an urban environment"],
+         ["@interior A modern living room interior with minimalist design"],
+         ["@fashion A runway model in haute couture"],
+         ["@minimalistic A simple and elegant design of a serene landscape"],
+         ["@modern A contemporary art piece with abstract geometric shapes"],
+         ["@animaliea A cute animal portrait with vibrant colors"],
+         ["@wallpaper A scenic mountain range perfect for a desktop wallpaper"],
+         ["@cars A sleek sports car cruising on a city street"],
+         ["@pencilart A detailed pencil sketch of a historic building"],
+         ["@artminimalistic An artistic minimalist composition with subtle tones"],
+         ["@tts2 What causes rainbows to form?"],
+     ],
+     cache_examples=False,
+     type="messages",
+     description=DESCRIPTION,
+     css=css,
+     fill_height=True,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="default [text, vision]; scroll down to the examples to explore more art styles"),
+     stop_btn="Stop Generation",
+     multimodal=True,
+ )
+
+ if __name__ == "__main__":
+     demo.queue(max_size=20).launch(ssr_mode=False, share=True)