ginipick committed (verified)
Commit 3b05042 · Parent(s): 9552e26

Update app.py

Files changed (1):
  1. app.py +486 -164
app.py CHANGED
@@ -1,7 +1,8 @@
1
-
2
  import os
3
 
4
- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
5
 
6
  import gradio as gr
7
  import torch
@@ -14,12 +15,32 @@ import spaces
14
 
15
  from PIL import Image
16
  from diffusers import AutoencoderKLHunyuanVideo
17
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
18
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
19
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
20
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
21
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
22
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
23
  from diffusers_helper.thread_utils import AsyncStream, async_run
24
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
25
  from transformers import SiglipImageProcessor, SiglipVisionModel
@@ -33,16 +54,44 @@ high_vram = free_mem_gb > 60
33
  print(f'Free VRAM {free_mem_gb} GB')
34
  print(f'High-VRAM Mode: {high_vram}')
35
 
36
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
37
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
38
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
39
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
40
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
41
-
42
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
43
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
44
-
45
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
46
 
47
  vae.eval()
48
  text_encoder.eval()
@@ -86,39 +135,41 @@ outputs_folder = './outputs/'
86
  os.makedirs(outputs_folder, exist_ok=True)
87
 
88
  examples = [
89
- ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm.",],
90
- ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
91
- ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."],
92
  ]
93
 
94
  def generate_examples(input_image, prompt):
95
-
96
- t2v=False
97
  n_prompt=""
98
  seed=31337
99
- total_second_length=5
100
- latent_window_size=9
101
- steps=25
102
- cfg=1.0
103
- gs=10.0
104
  rs=0.0
105
- gpu_memory_preservation=6
106
- use_teacache=True
107
  mp4_crf=16
108
 
109
  global stream
110
-
111
- # assert input_image is not None, 'No input image!'
112
  if t2v:
113
  default_height, default_width = 640, 640
114
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
115
- print("No input image provided. Using a blank white image.")
116
 
117
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
118
 
119
  stream = AsyncStream()
120
 
121
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
122
 
123
  output_filename = None
124
 
@@ -127,68 +178,111 @@ def generate_examples(input_image, prompt):
127
 
128
  if flag == 'file':
129
  output_filename = data
130
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
131
 
132
  if flag == 'progress':
133
  preview, desc, html = data
134
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
135
 
136
  if flag == 'end':
137
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
138
  break
139
 
140
-
141
-
142
  @torch.no_grad()
143
- def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
144
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
145
  total_latent_sections = int(max(round(total_latent_sections), 1))
146
 
147
  job_id = generate_timestamp()
148
 
149
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
150
 
151
  try:
152
- # Clean GPU
153
  if not high_vram:
154
  unload_complete_models(
155
  text_encoder, text_encoder_2, image_encoder, vae, transformer
156
  )
157
 
158
  # Text encoding
159
-
160
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
 
161
 
162
  if not high_vram:
163
- fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
164
  load_model_as_complete(text_encoder_2, target_device=gpu)
165
 
166
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
167
 
168
  if cfg == 1:
169
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
170
  else:
171
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
172
 
173
  llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
174
  llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
175
 
176
  # Processing input image
177
-
178
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
 
179
 
180
  H, W, C = input_image.shape
181
  height, width = find_nearest_bucket(H, W, resolution=640)
182
- input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
183
 
184
- Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
185
 
186
  input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
187
  input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
188
 
189
  # VAE encoding
190
-
191
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
 
192
 
193
  if not high_vram:
194
  load_model_as_complete(vae, target_device=gpu)
@@ -196,33 +290,42 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
196
  start_latent = vae_encode(input_image_pt, vae)
197
 
198
  # CLIP Vision
199
-
200
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
201
 
202
  if not high_vram:
203
  load_model_as_complete(image_encoder, target_device=gpu)
204
 
205
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
206
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
207
 
208
  # Dtype
209
-
210
  llama_vec = llama_vec.to(transformer.dtype)
211
  llama_vec_n = llama_vec_n.to(transformer.dtype)
212
  clip_l_pooler = clip_l_pooler.to(transformer.dtype)
213
  clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
214
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
215
 
216
- # Sampling
217
-
218
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
 
219
 
220
  rnd = torch.Generator("cpu").manual_seed(seed)
221
 
222
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
223
  history_pixels = None
224
 
225
- history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
226
  total_generated_latent_frames = 1
227
 
228
  for section_index in range(total_latent_sections):
@@ -234,7 +337,10 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
234
 
235
  if not high_vram:
236
  unload_complete_models()
237
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
238
 
239
  if use_teacache:
240
  transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
@@ -244,9 +350,11 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
244
  def callback(d):
245
  preview = d['denoised']
246
  preview = vae_decode_fake(preview)
247
-
248
  preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
249
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
250
 
251
  if stream.input_queue.top() == 'end':
252
  stream.output_queue.push(('end', None))
@@ -255,16 +363,35 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
255
  current_step = d['i'] + 1
256
  percentage = int(100.0 * current_step / steps)
257
  hint = f'Sampling {current_step}/{steps}'
258
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
259
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
260
  return
261
 
262
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
263
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
264
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
265
 
266
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
267
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
268
 
269
  generated_latents = sample_hunyuan(
270
  transformer=transformer,
@@ -275,7 +402,6 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
275
  real_guidance_scale=cfg,
276
  distilled_guidance_scale=gs,
277
  guidance_rescale=rs,
278
- # shift=3.0,
279
  num_inference_steps=steps,
280
  generator=rnd,
281
  prompt_embeds=llama_vec,
@@ -298,13 +424,21 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
298
  )
299
 
300
  total_generated_latent_frames += int(generated_latents.shape[2])
301
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
302
 
303
  if not high_vram:
304
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
305
  load_model_as_complete(vae, target_device=gpu)
306
 
307
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
308
 
309
  if history_pixels is None:
310
  history_pixels = vae_decode(real_history_latents, vae).cpu()
@@ -312,22 +446,33 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
312
  section_latent_frames = latent_window_size * 2
313
  overlapped_frames = latent_window_size * 4 - 3
314
 
315
- current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
316
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
317
 
318
  if not high_vram:
319
  unload_complete_models()
320
 
321
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
322
 
323
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
324
 
325
- print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
326
 
327
  stream.output_queue.push(('file', output_filename))
 
328
  except:
329
  traceback.print_exc()
330
-
331
  if not high_vram:
332
  unload_complete_models(
333
  text_encoder, text_encoder_2, image_encoder, vae, transformer
@@ -336,62 +481,62 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
336
  stream.output_queue.push(('end', None))
337
  return
338
 
339
- def get_duration(input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
340
  return total_second_length * 60
341
 
342
  @spaces.GPU(duration=get_duration)
343
- def process(input_image, prompt,
344
- t2v=False,
345
- n_prompt="",
346
- seed=31337,
347
- total_second_length=5,
348
- latent_window_size=9,
349
- steps=25,
350
- cfg=1.0,
351
- gs=10.0,
352
- rs=0.0,
353
- gpu_memory_preservation=6,
354
- use_teacache=True,
355
- mp4_crf=16
356
- ):
357
  global stream
358
-
359
- # assert input_image is not None, 'No input image!'
360
  if t2v:
361
  default_height, default_width = 640, 640
362
- input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
363
  print("No input image provided. Using a blank white image.")
364
  else:
365
- composite_rgba_uint8 = input_image["composite"]
 
366
 
367
- # rgb_uint8 will be (H, W, 3), dtype uint8
368
  rgb_uint8 = composite_rgba_uint8[:, :, :3]
369
- # mask_uint8 will be (H, W), dtype uint8
370
  mask_uint8 = composite_rgba_uint8[:, :, 3]
371
-
372
- # Create background
373
  h, w = rgb_uint8.shape[:2]
374
- # White background, (H, W, 3), dtype uint8
375
- background_uint8 = np.full((h, w, 3), 255, dtype=np.uint8)
376
-
377
- # Normalize mask to range [0.0, 1.0].
378
  alpha_normalized_float32 = mask_uint8.astype(np.float32) / 255.0
379
-
380
- # Expand alpha to 3 channels to match RGB images for broadcasting.
381
- # alpha_mask_float32 will have shape (H, W, 3)
382
- alpha_mask_float32 = np.stack([alpha_normalized_float32] * 3, axis=2)
383
-
384
- # alpha blending
385
- blended_image_float32 = rgb_uint8.astype(np.float32) * alpha_mask_float32 + \
386
- background_uint8.astype(np.float32) * (1.0 - alpha_mask_float32)
387
 
388
  input_image = np.clip(blended_image_float32, 0, 255).astype(np.uint8)
389
-
390
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
391
 
392
  stream = AsyncStream()
393
 
394
- async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)
395
 
396
  output_filename = None
397
 
@@ -400,88 +545,265 @@ def process(input_image, prompt,
400
 
401
  if flag == 'file':
402
  output_filename = data
403
- yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
404
 
405
- if flag == 'progress':
406
  preview, desc, html = data
407
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
408
 
409
- if flag == 'end':
410
- yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
411
  break
412
 
413
-
414
  def end_process():
415
  stream.input_queue.push('end')
416
 
417
-
418
  quick_prompts = [
419
  'The girl dances gracefully, with clear movements, full of charm.',
420
- 'A character doing some simple body movements.',
421
  ]
422
  quick_prompts = [[x] for x in quick_prompts]
423
 
424
 
425
- css = make_progress_bar_css()
426
  block = gr.Blocks(css=css).queue()
427
  with block:
428
- gr.Markdown('# FramePack-F1')
429
- gr.Markdown(f"""### Video diffusion, but feels like image diffusion
430
- *FramePack F1 - a FramePack model that only predicts future frames from history frames*
431
- ### *beta* FramePack Fill 🖋️- draw a mask over the input image to inpaint the video output
432
- adapted from the officical code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
433
  """)
 
434
  with gr.Row():
435
  with gr.Column():
436
- input_image = gr.ImageEditor(type="numpy", label="Image", height=320, brush=gr.Brush(colors=["#ffffff"]))
437
  prompt = gr.Textbox(label="Prompt", value='')
438
- t2v = gr.Checkbox(label="do text-to-video", value=False)
439
- example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
440
- example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
441
 
442
- with gr.Row():
443
- start_button = gr.Button(value="Start Generation")
444
- end_button = gr.Button(value="End Generation", interactive=False)
445
 
446
- total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
447
  with gr.Group():
448
- with gr.Accordion("Advanced settings", open=False):
449
- use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
450
-
451
- n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False) # Not used
452
  seed = gr.Number(label="Seed", value=31337, precision=0)
453
-
454
-
455
- latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
456
- steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
457
-
458
- cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
459
- gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
460
- rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
461
-
462
- gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
463
-
464
- mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
465
 
466
  with gr.Column():
467
- preview_image = gr.Image(label="Next Latents", height=200, visible=False)
468
- result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
 
469
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
470
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
471
 
472
- gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
473
 
474
- ips = [input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
475
- start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
476
  end_button.click(fn=end_process)
477
 
 
478
  # gr.Examples(
479
- # examples,
480
  # inputs=[input_image, prompt],
481
  # outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
482
  # fn=generate_examples,
483
  # cache_examples=True
484
- # )
485
-
486
 
487
  block.launch(share=True)
 
 
1
  import os
2
 
3
+ os.environ['HF_HOME'] = os.path.abspath(
4
+ os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
5
+ )
6
 
7
  import gradio as gr
8
  import torch
 
15
 
16
  from PIL import Image
17
  from diffusers import AutoencoderKLHunyuanVideo
18
+ from transformers import (
19
+ LlamaModel, CLIPTextModel,
20
+ LlamaTokenizerFast, CLIPTokenizer
21
+ )
22
+ from diffusers_helper.hunyuan import (
23
+ encode_prompt_conds, vae_decode,
24
+ vae_encode, vae_decode_fake
25
+ )
26
+ from diffusers_helper.utils import (
27
+ save_bcthw_as_mp4, crop_or_pad_yield_mask,
28
+ soft_append_bcthw, resize_and_center_crop,
29
+ state_dict_weighted_merge, state_dict_offset_merge,
30
+ generate_timestamp
31
+ )
32
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
33
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
34
+ from diffusers_helper.memory import (
35
+ cpu, gpu,
36
+ get_cuda_free_memory_gb,
37
+ move_model_to_device_with_memory_preservation,
38
+ offload_model_from_device_for_memory_preservation,
39
+ fake_diffusers_current_device,
40
+ DynamicSwapInstaller,
41
+ unload_complete_models,
42
+ load_model_as_complete
43
+ )
44
  from diffusers_helper.thread_utils import AsyncStream, async_run
45
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
46
  from transformers import SiglipImageProcessor, SiglipVisionModel
 
54
  print(f'Free VRAM {free_mem_gb} GB')
55
  print(f'High-VRAM Mode: {high_vram}')
56
 
57
+ text_encoder = LlamaModel.from_pretrained(
58
+ "hunyuanvideo-community/HunyuanVideo",
59
+ subfolder='text_encoder',
60
+ torch_dtype=torch.float16
61
+ ).cpu()
62
+ text_encoder_2 = CLIPTextModel.from_pretrained(
63
+ "hunyuanvideo-community/HunyuanVideo",
64
+ subfolder='text_encoder_2',
65
+ torch_dtype=torch.float16
66
+ ).cpu()
67
+ tokenizer = LlamaTokenizerFast.from_pretrained(
68
+ "hunyuanvideo-community/HunyuanVideo",
69
+ subfolder='tokenizer'
70
+ )
71
+ tokenizer_2 = CLIPTokenizer.from_pretrained(
72
+ "hunyuanvideo-community/HunyuanVideo",
73
+ subfolder='tokenizer_2'
74
+ )
75
+ vae = AutoencoderKLHunyuanVideo.from_pretrained(
76
+ "hunyuanvideo-community/HunyuanVideo",
77
+ subfolder='vae',
78
+ torch_dtype=torch.float16
79
+ ).cpu()
80
+
81
+ feature_extractor = SiglipImageProcessor.from_pretrained(
82
+ "lllyasviel/flux_redux_bfl",
83
+ subfolder='feature_extractor'
84
+ )
85
+ image_encoder = SiglipVisionModel.from_pretrained(
86
+ "lllyasviel/flux_redux_bfl",
87
+ subfolder='image_encoder',
88
+ torch_dtype=torch.float16
89
+ ).cpu()
90
+
91
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
92
+ 'lllyasviel/FramePack_F1_I2V_HY_20250503',
93
+ torch_dtype=torch.bfloat16
94
+ ).cpu()
95
 
96
  vae.eval()
97
  text_encoder.eval()
 
135
  os.makedirs(outputs_folder, exist_ok=True)
136
 
137
  examples = [
138
+ ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm."],
139
+ ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
140
+ ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."]
141
  ]
142
 
143
  def generate_examples(input_image, prompt):
144
+ t2v=False
 
145
  n_prompt=""
146
  seed=31337
147
+ total_second_length=5
148
+ latent_window_size=9
149
+ steps=25
150
+ cfg=1.0
151
+ gs=10.0
152
  rs=0.0
153
+ gpu_memory_preservation=6
154
+ use_teacache=True
155
  mp4_crf=16
156
 
157
  global stream
158
+
 
159
  if t2v:
160
  default_height, default_width = 640, 640
161
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
162
+ print("No input image provided. Using a blank white image.")
163
 
164
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
165
 
166
  stream = AsyncStream()
167
 
168
+ async_run(
169
+ worker, input_image, prompt, n_prompt, seed,
170
+ total_second_length, latent_window_size, steps,
171
+ cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf
172
+ )
173
 
174
  output_filename = None
175
 
 
178
 
179
  if flag == 'file':
180
  output_filename = data
181
+ yield (
182
+ output_filename,
183
+ gr.update(),
184
+ gr.update(),
185
+ gr.update(),
186
+ gr.update(interactive=False),
187
+ gr.update(interactive=True)
188
+ )
189
 
190
  if flag == 'progress':
191
  preview, desc, html = data
192
+ yield (
193
+ gr.update(),
194
+ gr.update(visible=True, value=preview),
195
+ desc, html,
196
+ gr.update(interactive=False),
197
+ gr.update(interactive=True)
198
+ )
199
 
200
  if flag == 'end':
201
+ yield (
202
+ output_filename,
203
+ gr.update(visible=False),
204
+ gr.update(),
205
+ '',
206
+ gr.update(interactive=True),
207
+ gr.update(interactive=False)
208
+ )
209
  break
210
 
 
 
211
  @torch.no_grad()
212
+ def worker(
213
+ input_image, prompt, n_prompt, seed,
214
+ total_second_length, latent_window_size,
215
+ steps, cfg, gs, rs,
216
+ gpu_memory_preservation, use_teacache, mp4_crf
217
+ ):
218
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
219
  total_latent_sections = int(max(round(total_latent_sections), 1))
220
 
221
  job_id = generate_timestamp()
222
 
223
+ stream.output_queue.push(
224
+ ('progress', (None, '', make_progress_bar_html(0, 'Starting ...')))
225
+ )
226
 
227
  try:
228
+ # Clean GPU if VRAM is low
229
  if not high_vram:
230
  unload_complete_models(
231
  text_encoder, text_encoder_2, image_encoder, vae, transformer
232
  )
233
 
234
  # Text encoding
235
+ stream.output_queue.push(
236
+ ('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...')))
237
+ )
238
 
239
  if not high_vram:
240
+ fake_diffusers_current_device(text_encoder, gpu)
241
  load_model_as_complete(text_encoder_2, target_device=gpu)
242
 
243
+ llama_vec, clip_l_pooler = encode_prompt_conds(
244
+ prompt, text_encoder, text_encoder_2,
245
+ tokenizer, tokenizer_2
246
+ )
247
 
248
  if cfg == 1:
249
+ llama_vec_n, clip_l_pooler_n = (
250
+ torch.zeros_like(llama_vec),
251
+ torch.zeros_like(clip_l_pooler)
252
+ )
253
  else:
254
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(
255
+ n_prompt, text_encoder, text_encoder_2,
256
+ tokenizer, tokenizer_2
257
+ )
258
 
259
  llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
260
  llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
261
 
262
  # Processing input image
263
+ stream.output_queue.push(
264
+ ('progress', (None, '', make_progress_bar_html(0, 'Image processing ...')))
265
+ )
266
 
267
  H, W, C = input_image.shape
268
  height, width = find_nearest_bucket(H, W, resolution=640)
269
+ input_image_np = resize_and_center_crop(
270
+ input_image,
271
+ target_width=width,
272
+ target_height=height
273
+ )
274
 
275
+ Image.fromarray(input_image_np).save(
276
+ os.path.join(outputs_folder, f'{job_id}.png')
277
+ )
278
 
279
  input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
280
  input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
281
 
282
  # VAE encoding
283
+ stream.output_queue.push(
284
+ ('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...')))
285
+ )
286
 
287
  if not high_vram:
288
  load_model_as_complete(vae, target_device=gpu)
 
290
  start_latent = vae_encode(input_image_pt, vae)
291
 
292
  # CLIP Vision
293
+ stream.output_queue.push(
294
+ ('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...')))
295
+ )
296
 
297
  if not high_vram:
298
  load_model_as_complete(image_encoder, target_device=gpu)
299
 
300
+ image_encoder_output = hf_clip_vision_encode(
301
+ input_image_np, feature_extractor, image_encoder
302
+ )
303
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
304
 
305
  # Dtype
 
306
  llama_vec = llama_vec.to(transformer.dtype)
307
  llama_vec_n = llama_vec_n.to(transformer.dtype)
308
  clip_l_pooler = clip_l_pooler.to(transformer.dtype)
309
  clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
310
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
311
 
312
+ # Start sampling
313
+ stream.output_queue.push(
314
+ ('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...')))
315
+ )
316
 
317
  rnd = torch.Generator("cpu").manual_seed(seed)
318
 
319
+ history_latents = torch.zeros(
320
+ size=(1, 16, 16 + 2 + 1, height // 8, width // 8),
321
+ dtype=torch.float32
322
+ ).cpu()
323
  history_pixels = None
324
 
325
+ history_latents = torch.cat(
326
+ [history_latents, start_latent.to(history_latents)],
327
+ dim=2
328
+ )
329
  total_generated_latent_frames = 1
330
 
331
  for section_index in range(total_latent_sections):
 
337
 
338
  if not high_vram:
339
  unload_complete_models()
340
+ move_model_to_device_with_memory_preservation(
341
+ transformer, target_device=gpu,
342
+ preserved_memory_gb=gpu_memory_preservation
343
+ )
344
 
345
  if use_teacache:
346
  transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
 
350
  def callback(d):
351
  preview = d['denoised']
352
  preview = vae_decode_fake(preview)
 
353
  preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
354
+ preview = einops.rearrange(
355
+ preview,
356
+ 'b c t h w -> (b h) (t w) c'
357
+ )
358
 
359
  if stream.input_queue.top() == 'end':
360
  stream.output_queue.push(('end', None))
 
363
  current_step = d['i'] + 1
364
  percentage = int(100.0 * current_step / steps)
365
  hint = f'Sampling {current_step}/{steps}'
366
+ desc = f'Section {section_index+1}/{total_latent_sections}'
367
+ stream.output_queue.push(
368
+ ('progress', (preview, desc, make_progress_bar_html(percentage, hint)))
369
+ )
370
  return
371
 
372
+ indices = torch.arange(
373
+ 0,
374
+ sum([1, 16, 2, 1, latent_window_size])
375
+ ).unsqueeze(0)
376
+ (
377
+ clean_latent_indices_start,
378
+ clean_latent_4x_indices,
379
+ clean_latent_2x_indices,
380
+ clean_latent_1x_indices,
381
+ latent_indices
382
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
383
+ clean_latent_indices = torch.cat(
384
+ [clean_latent_indices_start, clean_latent_1x_indices],
385
+ dim=1
386
+ )
387
 
388
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[
389
+ :, :, -sum([16, 2, 1]):, :, :
390
+ ].split([16, 2, 1], dim=2)
391
+ clean_latents = torch.cat(
392
+ [start_latent.to(history_latents), clean_latents_1x],
393
+ dim=2
394
+ )
395
 
396
  generated_latents = sample_hunyuan(
397
  transformer=transformer,
 
402
  real_guidance_scale=cfg,
403
  distilled_guidance_scale=gs,
404
  guidance_rescale=rs,
 
405
  num_inference_steps=steps,
406
  generator=rnd,
407
  prompt_embeds=llama_vec,
 
424
  )
425
 
426
  total_generated_latent_frames += int(generated_latents.shape[2])
427
+ history_latents = torch.cat(
428
+ [history_latents, generated_latents.to(history_latents)],
429
+ dim=2
430
+ )
431
 
432
  if not high_vram:
433
+ offload_model_from_device_for_memory_preservation(
434
+ transformer, target_device=gpu,
435
+ preserved_memory_gb=8
436
+ )
437
  load_model_as_complete(vae, target_device=gpu)
438
 
439
+ real_history_latents = history_latents[
440
+ :, :, -total_generated_latent_frames:, :, :
441
+ ]
442
 
443
  if history_pixels is None:
444
  history_pixels = vae_decode(real_history_latents, vae).cpu()
 
446
  section_latent_frames = latent_window_size * 2
447
  overlapped_frames = latent_window_size * 4 - 3
448
 
449
+ current_pixels = vae_decode(
450
+ real_history_latents[:, :, -section_latent_frames:], vae
451
+ ).cpu()
452
+ history_pixels = soft_append_bcthw(
453
+ history_pixels, current_pixels, overlapped_frames
454
+ )
455
 
456
  if not high_vram:
457
  unload_complete_models()
458
 
459
+ output_filename = os.path.join(
460
+ outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4'
461
+ )
462
 
463
+ save_bcthw_as_mp4(
464
+ history_pixels, output_filename,
465
+ fps=30, crf=mp4_crf
466
+ )
467
 
468
+ print(
469
+ f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}'
470
+ )
471
 
472
  stream.output_queue.push(('file', output_filename))
473
+
474
  except:
475
  traceback.print_exc()
 
476
  if not high_vram:
477
  unload_complete_models(
478
  text_encoder, text_encoder_2, image_encoder, vae, transformer
 
481
  stream.output_queue.push(('end', None))
482
  return
483
 
484
+ def get_duration(
485
+ input_image, prompt, t2v, n_prompt, seed,
486
+ total_second_length, latent_window_size, steps,
487
+ cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf
488
+ ):
489
  return total_second_length * 60
490
 
491
  @spaces.GPU(duration=get_duration)
492
+ def process(
493
+ input_image, prompt, t2v=False, n_prompt="",
494
+ seed=31337, total_second_length=5, latent_window_size=9,
495
+ steps=25, cfg=1.0, gs=10.0, rs=0.0,
496
+ gpu_memory_preservation=6, use_teacache=True, mp4_crf=16
497
+ ):
498
  global stream
499
+
 
500
  if t2v:
501
  default_height, default_width = 640, 640
502
+ input_image = np.ones(
503
+ (default_height, default_width, 3),
504
+ dtype=np.uint8
505
+ ) * 255
506
  print("No input image provided. Using a blank white image.")
507
  else:
508
+ # Split the composite RGBA returned by the ImageEditor
509
+ composite_rgba_uint8 = input_image["composite"]
510
 
511
+ # rgb_uint8: (H,W,3)
512
  rgb_uint8 = composite_rgba_uint8[:, :, :3]
513
+ # mask_uint8: (H,W)
514
  mask_uint8 = composite_rgba_uint8[:, :, 3]
515
+
516
+ # White background
517
  h, w = rgb_uint8.shape[:2]
518
+ background_uint8 = np.full((h, w, 3), 255, dtype=np.uint8)
519
+
520
+ # Normalize the alpha channel
 
521
  alpha_normalized_float32 = mask_uint8.astype(np.float32) / 255.0
522
+ alpha_mask_float32 = np.stack([alpha_normalized_float32]*3, axis=2)
523
+
524
+ # Alpha blending
525
+ blended_image_float32 = \
526
+ rgb_uint8.astype(np.float32) * alpha_mask_float32 + \
527
+ background_uint8.astype(np.float32) * (1.0 - alpha_mask_float32)
528
 
529
  input_image = np.clip(blended_image_float32, 0, 255).astype(np.uint8)
530
+
531
  yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
532
 
533
  stream = AsyncStream()
534
 
535
+ async_run(
536
+ worker, input_image, prompt, n_prompt, seed,
537
+ total_second_length, latent_window_size, steps,
538
+ cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf
539
+ )
540
 
541
  output_filename = None
542
 
 
545
 
546
  if flag == 'file':
547
  output_filename = data
548
+ yield (
549
+ output_filename,
550
+ gr.update(),
551
+ gr.update(),
552
+ gr.update(),
553
+ gr.update(interactive=False),
554
+ gr.update(interactive=True)
555
+ )
556
 
557
+ elif flag == 'progress':
558
  preview, desc, html = data
559
+ yield (
560
+ gr.update(),
561
+ gr.update(visible=True, value=preview),
562
+ desc, html,
563
+ gr.update(interactive=False),
564
+ gr.update(interactive=True)
565
+ )
566
 
567
+ elif flag == 'end':
568
+ yield (
569
+ output_filename,
570
+ gr.update(visible=False),
571
+ gr.update(),
572
+ '',
573
+ gr.update(interactive=True),
574
+ gr.update(interactive=False)
575
+ )
576
  break
577
 
 
578
  def end_process():
579
  stream.input_queue.push('end')
580
 
 
581
  quick_prompts = [
582
  'The girl dances gracefully, with clear movements, full of charm.',
583
+ 'A character doing some simple body movements.'
584
  ]
585
  quick_prompts = [[x] for x in quick_prompts]
586
 
587
+ # Base progress-bar CSS plus extra styling for the UI
588
+ def make_custom_css():
589
+ base_progress_css = make_progress_bar_css()
590
+ # Below: a slightly more pastel-toned style and a card-like UI
591
+ extra_css = """
592
+ body {
593
+ background: #fafbfe !important;
594
+ font-family: "Noto Sans", sans-serif;
595
+ }
596
+ #title-container {
597
+ text-align: center;
598
+ padding: 30px;
599
+ background: linear-gradient(135deg, #a8c0ff 0%, #fbc2eb 100%);
600
+ border-radius: 0 0 16px 16px;
601
+ margin-bottom: 20px;
602
+ }
603
+ #title-container h1 {
604
+ color: white;
605
+ font-size: 2.2rem;
606
+ margin: 0;
607
+ font-weight: 800;
608
+ text-shadow: 1px 2px 2px rgba(0,0,0,0.1);
609
+ }
610
+ .gr-panel {
611
+ background: #ffffffcc;
612
+ backdrop-filter: blur(4px);
613
+ border: 1px solid #dcdcf7;
614
+ border-radius: 12px;
615
+ padding: 16px;
616
+ margin-bottom: 8px;
617
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
618
+ }
619
+ .gr-box > label {
620
+ font-size: 0.9rem;
621
+ font-weight: 600;
622
+ color: #333;
623
+ }
624
+ .button-container button {
625
+ min-height: 48px;
626
+ font-size: 1rem;
627
+ font-weight: 600;
628
+ border-radius: 8px;
629
+ border: none !important;
630
+ }
631
+ .button-container button#start-button {
632
+ background-color: #4b9ffa !important;
633
+ color: #fff;
634
+ }
635
+ .button-container button#stop-button {
636
+ background-color: #ef5d84 !important;
637
+ color: #fff;
638
+ }
639
+ .button-container button:hover {
640
+ filter: brightness(0.97);
641
+ }
642
+ .no-generating-animation {
643
+ margin-top: 10px;
644
+ margin-bottom: 10px;
645
+ }
646
+ """
647
+ return base_progress_css + extra_css
648
+
649
+ css = make_custom_css()
650
 
 
651
  block = gr.Blocks(css=css).queue()
652
  with block:
653
+ # Top gradient banner
654
+ with gr.Box(elem_id="title-container"):
655
+ gr.Markdown("<h1>FramePack I2V</h1>")
656
+
657
+ # Description
658
+ gr.Markdown("""
659
+ ### Video diffusion, but feels like image diffusion
660
+ FramePack I2V - a model that predicts future frames from history frames,
661
+ enabling you to generate short animations from a single image and a text prompt.<br><br>
662
+ ***beta FramePack Fill*** - You can also paint over the input image to inpaint the video output.
663
  """)
664
+
665
  with gr.Row():
666
  with gr.Column():
667
+ input_image = gr.ImageEditor(
668
+ type="numpy",
669
+ label="Image (click 'Brush' tool to mask)",
670
+ height=320,
671
+ brush=gr.Brush(colors=["#ffffff"])
672
+ )
673
  prompt = gr.Textbox(label="Prompt", value='')
674
 
675
+ t2v = gr.Checkbox(
676
+ label="Generate from Text Only (no image)?",
677
+ value=False
678
+ )
679
+ example_quick_prompts = gr.Dataset(
680
+ samples=quick_prompts,
681
+ label="Quick Prompt Picks",
682
+ samples_per_page=1000,
683
+ components=[prompt]
684
+ )
685
+ example_quick_prompts.click(
686
+ fn=lambda x: x[0],
687
+ inputs=[example_quick_prompts],
688
+ outputs=prompt,
689
+ show_progress=False,
690
+ queue=False
691
+ )
692
+
693
+ with gr.Row(elem_classes="button-container"):
694
+ start_button = gr.Button(value="Start Generation", elem_id="start-button")
695
+ end_button = gr.Button(value="Stop Generation", elem_id="stop-button", interactive=False)
696
+
697
+ total_second_length = gr.Slider(
698
+ label="Total Video Length (sec)",
699
+ minimum=1,
700
+ maximum=5,
701
+ value=2,
702
+ step=0.1
703
+ )
704
 
 
705
  with gr.Group():
706
+ with gr.Accordion("Advanced Settings", open=False):
707
+ use_teacache = gr.Checkbox(
708
+ label='Use TeaCache',
709
+ value=True,
710
+ info='Faster speed but can degrade finger/hand details'
711
+ )
712
+ n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)
713
  seed = gr.Number(label="Seed", value=31337, precision=0)
714
+
715
+ latent_window_size = gr.Slider(
716
+ label="Latent Window Size",
717
+ minimum=1,
718
+ maximum=33,
719
+ value=9,
720
+ step=1,
721
+ visible=False
722
+ )
723
+ steps = gr.Slider(
724
+ label="Steps",
725
+ minimum=1,
726
+ maximum=100,
727
+ value=25,
728
+ step=1,
729
+ info='Not recommended to change significantly.'
730
+ )
731
+ cfg = gr.Slider(
732
+ label="CFG Scale",
733
+ minimum=1.0,
734
+ maximum=32.0,
735
+ value=1.0,
736
+ step=0.01,
737
+ visible=False
738
+ )
739
+ gs = gr.Slider(
740
+ label="Distilled CFG Scale",
741
+ minimum=1.0,
742
+ maximum=32.0,
743
+ value=10.0,
744
+ step=0.01,
745
+ info='Not recommended to change significantly.'
746
+ )
747
+ rs = gr.Slider(
748
+ label="CFG Re-Scale",
749
+ minimum=0.0,
750
+ maximum=1.0,
751
+ value=0.0,
752
+ step=0.01,
753
+ visible=False
754
+ )
755
+ gpu_memory_preservation = gr.Slider(
756
+ label="GPU Memory Preservation (GB)",
757
+ minimum=6,
758
+ maximum=128,
759
+ value=6,
760
+ step=0.1,
761
+ info="Increase if OOM occurs (slower speed)."
762
+ )
763
+ mp4_crf = gr.Slider(
764
+ label="MP4 Compression (CRF)",
765
+ minimum=0,
766
+ maximum=100,
767
+ value=16,
768
+ step=1,
769
+ info="Lower is higher quality. 16 is recommended."
770
+ )
771
 
772
  with gr.Column():
773
+ preview_image = gr.Image(label="Preview Latents", height=200, visible=False)
774
+ result_video = gr.Video(label="Generated Video", autoplay=True, height=512, loop=True)
775
+
776
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
777
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
778
 
779
+ gr.HTML("""
780
+ <div style="text-align:center; margin-top:20px;">
781
+ Share your creations or find inspiration by searching
782
+ <a href="https://x.com/search?q=framepack&f=live" target="_blank">#framepack</a> on Twitter (X)!
783
+ </div>
784
+ """)
785
 
786
+ # Wire up the callbacks
787
+ ips = [
788
+ input_image, prompt, t2v, n_prompt, seed,
789
+ total_second_length, latent_window_size, steps,
790
+ cfg, gs, rs, gpu_memory_preservation,
791
+ use_teacache, mp4_crf
792
+ ]
793
+ start_button.click(
794
+ fn=process,
795
+ inputs=ips,
796
+ outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]
797
+ )
798
  end_button.click(fn=end_process)
799
 
800
+ # Example gallery (uncomment to enable)
801
  # gr.Examples(
802
+ # examples=examples,
803
  # inputs=[input_image, prompt],
804
  # outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
805
  # fn=generate_examples,
806
  # cache_examples=True
807
+ # )
 
808
 
809
  block.launch(share=True)