prithivMLmods committed
Commit 7fcd908 · verified · 1 Parent(s): 78be7e8

Update app.py

Files changed (1)
  1. app.py +269 -246
app.py CHANGED
@@ -1,19 +1,19 @@
 import os
 import random
 import uuid
 import time
 import asyncio
 from threading import Thread
-from typing import Tuple

 import gradio as gr
 import spaces
 import torch
 import numpy as np
 from PIL import Image
-import cv2

-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -22,45 +22,151 @@ from transformers import (
     AutoProcessor,
 )
 from transformers.image_utils import load_image

-# ---------------------------
-# Global Settings and Devices
-# ---------------------------
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-MAX_SEED = np.iinfo(np.int32).max

-# ---------------------------
-# IMAGE GEN LORA TAB: SDXL Gen with LoRA Options
-# ---------------------------

-# Load the SDXL pipeline
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # Path from env variable
-if MODEL_ID_SD is None:
-    MODEL_ID_SD = "SG161222/RealVisXL_V4.0_Lightning"  # default fallback

-# Load SDXL pipeline (use GPU if available)
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
 if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()

-# Optional: compile or offload if desired
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()

 def save_image(img: Image.Image) -> str:
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name
@@ -70,241 +176,158 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     seed = random.randint(0, MAX_SEED)
     return seed

-# LoRA options and style definitions
-LORA_OPTIONS = {
-    "Realism (face/character)👦🏻": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
-    "Pixar (art/toons)🙀": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
-    "Photoshoot (camera/film)📸": ("prithivMLmods/Canopus-Photo-Shoot-Mini-LoRA", "Canopus-Photo-Shoot-Mini-LoRA.safetensors", "photo"),
-    "Clothing (hoodies/pant/shirts)👔": ("prithivMLmods/Canopus-Clothing-Adp-LoRA", "Canopus-Dress-Clothing-LoRA.safetensors", "clth"),
-    "Interior Architecture (house/hotel)🏠": ("prithivMLmods/Canopus-Interior-Architecture-0.1", "Canopus-Interior-Architecture-0.1δ.safetensors", "arch"),
-    "Fashion Product (wearing/usable)👜": ("prithivMLmods/Canopus-Fashion-Product-Dilation", "Canopus-Fashion-Product-Dilation.safetensors", "fashion"),
-    "Minimalistic Image (minimal/detailed)🏞️": ("prithivMLmods/Pegasi-Minimalist-Image-Style", "Pegasi-Minimalist-Image-Style.safetensors", "minimalist"),
-    "Modern Clothing (trend/new)👕": ("prithivMLmods/Canopus-Modern-Clothing-Design", "Canopus-Modern-Clothing-Design.safetensors", "mdrnclth"),
-    "Animaliea (farm/wild)🫎": ("prithivMLmods/Canopus-Animaliea-Artism", "Canopus-Animaliea-Artism.safetensors", "Animaliea"),
-    "Liquid Wallpaper (minimal/illustration)🖼️": ("prithivMLmods/Canopus-Liquid-Wallpaper-Art", "Canopus-Liquid-Wallpaper-Minimalize-LoRA.safetensors", "liquid"),
-    "Canes Cars (realistic/futurecars)🚘": ("prithivMLmods/Canes-Cars-Model-LoRA", "Canes-Cars-Model-LoRA.safetensors", "car"),
-    "Pencil Art (characteristic/creative)✏️": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
-    "Art Minimalistic (paint/semireal)🎨": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
-}
-
-style_list = [
-    {
-        "name": "3840 x 2160",
-        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "2560 x 1440",
-        "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "HD+",
-        "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "Style Zero",
-        "prompt": "{prompt}",
-        "negative_prompt": "",
-    },
-]
-styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
-DEFAULT_STYLE_NAME = "3840 x 2160"
-STYLE_NAMES = list(styles.keys())
-
-def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
-    if style_name in styles:
-        p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
-    else:
-        p, n = styles[DEFAULT_STYLE_NAME]
-    return p.replace("{prompt}", positive), n + negative
-
 @spaces.GPU(duration=180, enable_queue=True)
-def generate_image_lora(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = True,
-    seed: int = 0,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    randomize_seed: bool = False,
-    style_name: str = DEFAULT_STYLE_NAME,
-    lora_model: str = "Realism (face/character)👦🏻",
-    progress=gr.Progress(track_tqdm=True),
-):
     seed = int(randomize_seed_fn(seed, randomize_seed))
-    positive_prompt, effective_negative_prompt = apply_style(style_name, prompt, negative_prompt)
-    if not use_negative_prompt:
-        effective_negative_prompt = ""
-    # Set LoRA adapter based on selection
     model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
-    sd_pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
-    sd_pipe.to(device)
-
-    outputs = sd_pipe(
-        prompt=positive_prompt,
-        negative_prompt=effective_negative_prompt,
-        width=width,
-        height=height,
-        guidance_scale=guidance_scale,
-        num_inference_steps=20,
-        num_images_per_prompt=1,
-        cross_attention_kwargs={"scale": 0.65},
-        output_type="pil",
     )
-    image_paths = [save_image(img) for img in outputs.images]
     return image_paths, seed

-# ---------------------------
-# Qwen 2 VL OCR TAB
-# ---------------------------
-MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
-model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_QWEN,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda" if torch.cuda.is_available() else "cpu").eval()
-
-@spaces.GPU
-def qwen2vl_ocr_generate(
-    prompt: str,
-    file: list,
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-):
-    # In this tab, we assume the user supplies an image (or multiple images) for OCR.
-    images = []
-    if file:
-        # load image(s) using the helper function
-        for f in file:
-            images.append(load_image(f))
-    else:
-        # If no image provided, use an empty list
-        images = []
-    # Build message content: We use a simple chat template with text and images.
-    messages = [{
-        "role": "user",
-        "content": [
-            *[{"type": "image", "image": image} for image in images],
-            {"type": "text", "text": prompt},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda" if torch.cuda.is_available() else "cpu")
-    # Use non-streaming generation for simplicity
-    output_ids = model_m.generate(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-    )
-    final_response = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    return final_response
-
-# ---------------------------
-# CHAT INTERFACE TAB (Text-only)
-# ---------------------------
-# Load text-only model and tokenizer
-model_id_text = "prithivMLmods/FastThink-0.5B-Tiny"
-tokenizer = AutoTokenizer.from_pretrained(model_id_text)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id_text,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-
-def chat_generate(prompt: str, max_new_tokens: int = 1024, temperature: float = 0.6,
-                  top_p: float = 0.9, top_k: int = 50, repetition_penalty: float = 1.2):
-    # For simplicity, use a basic generate without streaming.
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
     input_ids = input_ids.to(model.device)
-    output_ids = model.generate(
-        input_ids=input_ids,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-    )
-    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    return response

-# ---------------------------
-# GRADIO INTERFACE WITH TABS
-# ---------------------------
-with gr.Blocks(title="Multi-Modal Playground") as demo:
-    gr.Markdown("# Multi-Modal Playground")

-    with gr.Tab("Image Gen LoRA"):
-        gr.Markdown("## Generate Images using SDXL + LoRA")
-        with gr.Row():
-            prompt_img = gr.Textbox(label="Prompt", placeholder="Enter your image prompt here")
-            negative_prompt_img = gr.Textbox(label="Negative Prompt", placeholder="Enter negative prompt (optional)", lines=2)
         with gr.Row():
-            use_negative = gr.Checkbox(label="Use Negative Prompt", value=True)
-            seed_img = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
-            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
         with gr.Row():
-            width_img = gr.Slider(label="Width", minimum=512, maximum=2048, step=8, value=1024)
-            height_img = gr.Slider(label="Height", minimum=512, maximum=2048, step=8, value=1024)
         with gr.Row():
-            guidance_scale_img = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=20.0, step=0.1, value=3.0)
         with gr.Row():
-            style_selection = gr.Radio(choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Quality Style")
-            lora_selection = gr.Dropdown(choices=list(LORA_OPTIONS.keys()), value="Realism (face/character)👦🏻", label="LoRA Selection")
-        run_img = gr.Button("Generate Image")
-        gallery = gr.Gallery(label="Generated Images", columns=1).style(full_width=True)
-        output_seed = gr.Number(label="Seed Used")
-        run_img.click(
-            generate_image_lora,
-            inputs=[prompt_img, negative_prompt_img, use_negative, seed_img, width_img, height_img, guidance_scale_img,
-                    randomize_seed, style_selection, lora_selection],
-            outputs=[gallery, output_seed]
         )

-    with gr.Tab("Qwen 2 VL OCR"):
-        gr.Markdown("## Extract and Generate Text from Images (OCR)")
-        with gr.Row():
-            prompt_ocr = gr.Textbox(label="OCR Prompt", placeholder="Enter instructions for OCR/text extraction")
-            file_ocr = gr.File(label="Upload Image", file_types=["image"], file_count="multiple")
-        run_ocr = gr.Button("Run OCR")
-        output_ocr = gr.Textbox(label="OCR Output")
-        run_ocr.click(
-            qwen2vl_ocr_generate,
-            inputs=[prompt_ocr, file_ocr],
-            outputs=output_ocr
         )

-    with gr.Tab("Chat Interface"):
-        gr.Markdown("## Chat with the Text-Only Model")
-        chat_input = gr.Textbox(label="Enter your message", placeholder="Say something...")
-        max_tokens_chat = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-        temperature_chat = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-        top_p_chat = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-        top_k_chat = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-        rep_penalty_chat = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-        run_chat = gr.Button("Send")
-        chat_output = gr.Textbox(label="Response")
-        run_chat.click(
-            chat_generate,
-            inputs=[chat_input, max_tokens_chat, temperature_chat, top_p_chat, top_k_chat, rep_penalty_chat],
-            outputs=chat_output
         )

-    gr.Markdown("**Adjust parameters in each tab as needed.**")

-if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)
 import os
 import random
 import uuid
+import json
 import time
 import asyncio
+import re
 from threading import Thread

 import gradio as gr
 import spaces
 import torch
 import numpy as np
 from PIL import Image
+import edge_tts

 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     AutoProcessor,
 )
 from transformers.image_utils import load_image
+from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+
+DESCRIPTION = """
+# Gen Vision 🎃
+Separate Tabs for Chat, Image Generation (LoRA), Qwen2 VL OCR and Text-to-Speech
+"""
+
+css = '''
+h1 {
+  text-align: center;
+  display: block;
+}
+
+#duplicate-button {
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;
+}
+'''
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+# -----------------------
+# Progress Bar Helper
+# -----------------------
+def progress_bar_html(label: str) -> str:
+    """
+    Return an HTML snippet for a thin progress bar with a label.
+    The bar is an animated magenta strip on a plum track.
+    """
+    return f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #DDA0DD; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #FF00FF; animation: loading 1.5s linear infinite;"></div>
+    </div>
+</div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+'''
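+# Illustrative use (an assumption about how callers would consume this helper):
+# a handler that streams output can yield this HTML first, e.g.
+#     yield progress_bar_html("Generating...")
+# and then replace it with the final result once generation completes.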
 
+# -----------------------
+# Text Generation Setup (Chat)
+# -----------------------
+model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+model.eval()
+
+# -----------------------
+# TTS Setup
+# -----------------------
+TTS_VOICES = [
+    "en-US-JennyNeural",
+    "en-US-GuyNeural",
+]
+
+async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+    """Convert text to speech using Edge TTS and save the result as an MP3 file."""
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+    return output_file
+
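+# Illustrative direct call (hypothetical filename):
+#     asyncio.run(text_to_speech("Hello world", "en-US-JennyNeural", "hello.mp3"))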
+# -----------------------
+# Utility: Clean Chat History
+# -----------------------
+def clean_chat_history(chat_history):
+    """
+    Filter out any chat entries whose "content" is not a string.
+    """
+    cleaned = []
+    for msg in chat_history:
+        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+            cleaned.append(msg)
+    return cleaned
+
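+# Example: a non-text turn such as {"role": "user", "content": ("image.png",)} is
+# dropped, leaving only plain-text turns that apply_chat_template() can encode.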
+# -----------------------
+# Qwen2 VL OCR Setup
+# -----------------------
+OCR_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
+processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
+model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+    OCR_MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()  # use the shared device so CPU-only machines do not crash
+
+# -----------------------
+# Stable Diffusion Image Generation Setup (LoRA)
+# -----------------------
+MAX_SEED = np.iinfo(np.int32).max
+USE_TORCH_COMPILE = False
+ENABLE_CPU_OFFLOAD = False
+
+# LoRA options: repo id, weight filename, and adapter name for each style.
+# Defined outside the CUDA branch so the UI dropdown works on any device.
+LORA_OPTIONS = {
+    "Realism": ("prithivMLmods/Canopus-Realism-LoRA", "Canopus-Realism-LoRA.safetensors", "rlms"),
+    "Pixar": ("prithivMLmods/Canopus-Pixar-Art", "Canopus-Pixar-Art.safetensors", "pixar"),
+    "Photoshoot": ("prithivMLmods/Canopus-Photo-Shoot-Mini-LoRA", "Canopus-Photo-Shoot-Mini-LoRA.safetensors", "photo"),
+    "Clothing": ("prithivMLmods/Canopus-Clothing-Adp-LoRA", "Canopus-Dress-Clothing-LoRA.safetensors", "clth"),
+    "Interior": ("prithivMLmods/Canopus-Interior-Architecture-0.1", "Canopus-Interior-Architecture-0.1δ.safetensors", "arch"),
+    "Fashion": ("prithivMLmods/Canopus-Fashion-Product-Dilation", "Canopus-Fashion-Product-Dilation.safetensors", "fashion"),
+    "Minimalistic": ("prithivMLmods/Pegasi-Minimalist-Image-Style", "Pegasi-Minimalist-Image-Style.safetensors", "minimalist"),
+    "Modern": ("prithivMLmods/Canopus-Modern-Clothing-Design", "Canopus-Modern-Clothing-Design.safetensors", "mdrnclth"),
+    "Animaliea": ("prithivMLmods/Canopus-Animaliea-Artism", "Canopus-Animaliea-Artism.safetensors", "Animaliea"),
+    "Wallpaper": ("prithivMLmods/Canopus-Liquid-Wallpaper-Art", "Canopus-Liquid-Wallpaper-Minimalize-LoRA.safetensors", "liquid"),
+    "Cars": ("prithivMLmods/Canes-Cars-Model-LoRA", "Canes-Cars-Model-LoRA.safetensors", "car"),
+    "PencilArt": ("prithivMLmods/Canopus-Pencil-Art-LoRA", "Canopus-Pencil-Art-LoRA.safetensors", "Pencil Art"),
+    "ArtMinimalistic": ("prithivMLmods/Canopus-Art-Medium-LoRA", "Canopus-Art-Medium-LoRA.safetensors", "mdm"),
+}
+
 if torch.cuda.is_available():
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        "SG161222/RealVisXL_V4.0_Lightning",
+        torch_dtype=torch.float16,
+        use_safetensors=True,
+    )
+    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+
+    # Load all LoRA weights up front so set_adapters() can switch between them.
+    for model_name, weight_name, adapter_name in LORA_OPTIONS.values():
+        pipe.load_lora_weights(model_name, weight_name=weight_name, adapter_name=adapter_name)
+    pipe.to("cuda")
+else:
+    # CPU fallback: no LoRA adapters are loaded here, so styled generation is limited.
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        "SG161222/RealVisXL_V4.0_Lightning",
+        torch_dtype=torch.float32,
+        use_safetensors=True,
+    ).to(device)
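+# With every adapter preloaded, switching styles at inference time is just
+# pipe.set_adapters(<adapter name>); the LoRA strength is then applied per call
+# via cross_attention_kwargs={"scale": ...} in generate_image below.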
 
 def save_image(img: Image.Image) -> str:
+    """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
     return unique_name

     seed = random.randint(0, MAX_SEED)
     return seed

 @spaces.GPU(duration=180, enable_queue=True)
+def generate_image(prompt: str, negative_prompt: str, seed: int, width: int, height: int, guidance_scale: float, randomize_seed: bool, lora_model: str):
     seed = int(randomize_seed_fn(seed, randomize_seed))
+    effective_negative_prompt = negative_prompt  # use the provided negative prompt, if any
     model_name, weight_name, adapter_name = LORA_OPTIONS[lora_model]
+    pipe.set_adapters(adapter_name)
+    outputs = pipe(
+        prompt=prompt,
+        negative_prompt=effective_negative_prompt,
+        width=width,
+        height=height,
+        guidance_scale=guidance_scale,
+        num_inference_steps=28,
+        num_images_per_prompt=1,
+        cross_attention_kwargs={"scale": 0.65},
+        generator=torch.Generator(device=device).manual_seed(seed),  # seed the pipeline so the returned seed is meaningful
+        output_type="pil",
     )
+    images = outputs.images
+    image_paths = [save_image(img) for img in images]
     return image_paths, seed
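+# Note: with Randomize Seed unchecked, identical prompt/adapter/seed inputs are
+# reproducible via the seeded Generator passed to the pipeline above.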
 
+# -----------------------
+# Chat Generation Function (Text-only)
+# -----------------------
+def generate_chat(input_text: str, chat_history: list, max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
+    conversation = clean_chat_history(chat_history)
+    conversation.append({"role": "user", "content": input_text})
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
     input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "top_p": top_p,
+        "top_k": top_k,
+        "temperature": temperature,
+        "num_beams": 1,
+        "repetition_penalty": repetition_penalty,
+    }
+    t = Thread(target=model.generate, kwargs=generation_kwargs)
+    t.start()
+    outputs = []
+    for new_text in streamer:
+        outputs.append(new_text)
+    final_response = "".join(outputs)
+    chat_history.append({"role": "user", "content": input_text})  # echo the user turn so it appears in the Chatbot
+    chat_history.append({"role": "assistant", "content": final_response})
+    return chat_history
+
+ # -----------------------
232
+ # Qwen2 VL OCR Function (Multimodal)
233
+ # -----------------------
234
+ def generate_ocr(text: str, files, max_new_tokens: int):
235
+ if files:
236
+ if isinstance(files, list) and len(files) > 1:
237
+ images = [load_image(image) for image in files]
238
+ elif isinstance(files, list) and len(files) == 1:
239
+ images = [load_image(files[0])]
240
+ else:
241
+ images = [load_image(files)]
242
+ messages = [{
243
+ "role": "user",
244
+ "content": [*([{"type": "image", "image": image} for image in images]),
245
+ {"type": "text", "text": text}]
246
+ }]
247
+ prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
248
+ inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
249
+ streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
250
+ generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
251
+ thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
252
+ thread.start()
253
+ buffer = ""
254
+ for new_text in streamer:
255
+ buffer += new_text
256
+ return buffer
257
+ else:
258
+ return "No images provided."
259
+
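+# Illustrative call (hypothetical filename):
+#     generate_ocr("Extract all text", ["receipt.png"], 512)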
+# -----------------------
+# Text-to-Speech Function
+# -----------------------
+def generate_tts(text: str, voice: str):
+    output_file = asyncio.run(text_to_speech(text, voice))
+    return output_file

+# -----------------------
+# Gradio Interface with Tabs
+# -----------------------
+with gr.Blocks(css=css, title="Gen Vision") as demo:
+    gr.Markdown(DESCRIPTION)

+    with gr.Tab("Chat Interface"):
         with gr.Row():
+            chat_history = gr.Chatbot(label="Chat History", type="messages")  # messages format matches the dict entries built in generate_chat
         with gr.Row():
+            chat_input = gr.Textbox(placeholder="Enter your message", label="Your Message")
         with gr.Row():
+            max_new_tokens_slider = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+            temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
         with gr.Row():
+            top_p_slider = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+            top_k_slider = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+            repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        send_btn = gr.Button("Send")
+        send_btn.click(
+            fn=generate_chat,
+            inputs=[chat_input, chat_history, max_new_tokens_slider, temperature_slider, top_p_slider, top_k_slider, repetition_penalty_slider],
+            outputs=chat_history,
         )

+    with gr.Tab("Image Generation"):
+        image_prompt = gr.Textbox(label="Prompt", placeholder="Enter image prompt")
+        negative_prompt = gr.Textbox(label="Negative Prompt", placeholder="Enter negative prompt")
+        seed_input = gr.Number(label="Seed", value=0)
+        width_slider = gr.Slider(label="Width", minimum=256, maximum=2048, step=64, value=1024)
+        height_slider = gr.Slider(label="Height", minimum=256, maximum=2048, step=64, value=1024)
+        guidance_scale_slider = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=3.0)
+        randomize_checkbox = gr.Checkbox(label="Randomize Seed", value=True)
+        lora_dropdown = gr.Dropdown(label="LoRA Style", choices=list(LORA_OPTIONS.keys()), value="Realism")
+        generate_img_btn = gr.Button("Generate Image")
+        img_output = gr.Gallery(label="Generated Images")  # generate_image returns a list of file paths
+        seed_output = gr.Number(label="Used Seed")
+        generate_img_btn.click(
+            fn=generate_image,
+            inputs=[image_prompt, negative_prompt, seed_input, width_slider, height_slider, guidance_scale_slider, randomize_checkbox, lora_dropdown],
+            outputs=[img_output, seed_output],
         )

+    with gr.Tab("Qwen 2 VL OCR"):
+        ocr_text = gr.Textbox(label="Text Prompt", placeholder="Enter prompt for OCR")
+        file_input = gr.File(label="Upload Images", file_count="multiple")
+        ocr_max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+        ocr_btn = gr.Button("Run OCR")
+        ocr_output = gr.Textbox(label="OCR Output")
+        ocr_btn.click(
+            fn=generate_ocr,
+            inputs=[ocr_text, file_input, ocr_max_new_tokens],
+            outputs=ocr_output,
         )

+    with gr.Tab("Text-to-Speech"):
+        tts_text = gr.Textbox(label="Text", placeholder="Enter text for TTS")
+        voice_dropdown = gr.Dropdown(label="Voice", choices=TTS_VOICES, value=TTS_VOICES[0])
+        tts_btn = gr.Button("Generate Audio")
+        tts_audio = gr.Audio(label="Audio Output", type="filepath")
+        tts_btn.click(
+            fn=generate_tts,
+            inputs=[tts_text, voice_dropdown],
+            outputs=tts_audio,
+        )

+demo.queue(max_size=20).launch(share=True)