ginipick committed on
Commit
9552e26
·
verified ·
1 Parent(s): 7102dbd

Update app.py

Files changed (1)
  1. app.py +294 -1353
app.py CHANGED
@@ -1,13 +1,7 @@
1
- #############################################
2
- # from diffusers_helper.hf_login import login
3
- # Use HF login if needed (uncomment the import above)
4
- #############################################
5
 
6
  import os
7
 
8
- os.environ['HF_HOME'] = os.path.abspath(
9
- os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
10
- )
11
 
12
  import gradio as gr
13
  import torch
@@ -16,1028 +10,137 @@ import einops
16
  import safetensors.torch as sf
17
  import numpy as np
18
  import math
19
- import time
20
-
21
- # Check whether we are running in a Hugging Face Spaces environment
22
- IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
23
-
24
- # --------- Translation dictionary (English only) ---------
25
- translations = {
26
- "en": {
27
- "title": "FramePack - Image to Video Generation",
28
- "upload_image": "Upload Image",
29
- "prompt": "Prompt",
30
- "quick_prompts": "Quick Prompts",
31
- "start_generation": "Generate",
32
- "stop_generation": "Stop",
33
- "use_teacache": "Use TeaCache",
34
- "teacache_info": "Faster speed, but may result in slightly worse finger and hand generation.",
35
- "negative_prompt": "Negative Prompt",
36
- "seed": "Seed",
37
- # UI label updated to reflect the 4-second maximum
38
- "video_length": "Video Length (max 4 seconds)",
39
- "latent_window": "Latent Window Size",
40
- "steps": "Inference Steps",
41
- "steps_info": "Changing this value is not recommended.",
42
- "cfg_scale": "CFG Scale",
43
- "distilled_cfg": "Distilled CFG Scale",
44
- "distilled_cfg_info": "Changing this value is not recommended.",
45
- "cfg_rescale": "CFG Rescale",
46
- "gpu_memory": "GPU Memory Preservation (GB) (larger means slower)",
47
- "gpu_memory_info": "Set this to a larger value if you encounter OOM errors. Larger values cause slower speed.",
48
- "next_latents": "Next Latents",
49
- "generated_video": "Generated Video",
50
- "sampling_note": "Note: The model predicts future frames from past frames. If the start action isn't immediately visible, please wait for more frames.",
51
- "error_message": "Error",
52
- "processing_error": "Processing error",
53
- "network_error": "Network connection is unstable, model download timed out. Please try again later.",
54
- "memory_error": "GPU memory insufficient, please try increasing GPU memory preservation value or reduce video length.",
55
- "model_error": "Failed to load model, possibly due to network issues or high server load. Please try again later.",
56
- "partial_video": "Processing error, but partial video has been generated",
57
- "processing_interrupt": "Processing was interrupted, but partial video has been generated"
58
- }
59
- }
60
-
61
- def get_translation(key):
62
- return translations["en"].get(key, key)
63
-
64
- #############################################
65
- # diffusers_helper imports
66
- #############################################
67
  from diffusers_helper.thread_utils import AsyncStream, async_run
68
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
69
- from diffusers_helper.memory import (
70
- cpu,
71
- gpu,
72
- get_cuda_free_memory_gb,
73
- move_model_to_device_with_memory_preservation,
74
- offload_model_from_device_for_memory_preservation,
75
- fake_diffusers_current_device,
76
- DynamicSwapInstaller,
77
- unload_complete_models,
78
- load_model_as_complete
79
- )
80
- from diffusers_helper.utils import (
81
- generate_timestamp,
82
- save_bcthw_as_mp4,
83
- resize_and_center_crop,
84
- crop_or_pad_yield_mask,
85
- soft_append_bcthw
86
- )
87
- from diffusers_helper.bucket_tools import find_nearest_bucket
88
- from diffusers_helper.hunyuan import (
89
- encode_prompt_conds, vae_encode, vae_decode, vae_decode_fake
90
- )
91
  from diffusers_helper.clip_vision import hf_clip_vision_encode
92
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
93
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
94
-
95
- from diffusers import AutoencoderKLHunyuanVideo
96
- from transformers import (
97
- LlamaModel, CLIPTextModel,
98
- LlamaTokenizerFast, CLIPTokenizer,
99
- SiglipVisionModel, SiglipImageProcessor
100
- )
101
-
102
- #############################################
103
- # GPU check
104
- #############################################
105
- GPU_AVAILABLE = torch.cuda.is_available()
106
- free_mem_gb = 0.0
107
- high_vram = False
108
- if GPU_AVAILABLE:
109
- try:
110
- free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
111
- high_vram = (free_mem_gb > 60)
112
- except:
113
- pass
114
- print(f"GPU Available: {GPU_AVAILABLE}, free_mem_gb={free_mem_gb}, high_vram={high_vram}")
115
-
116
- cpu_fallback_mode = not GPU_AVAILABLE
117
- last_update_time = time.time()
118
-
119
- #############################################
120
- # Model loading (global)
121
- #############################################
122
- text_encoder = None
123
- text_encoder_2 = None
124
- tokenizer = None
125
- tokenizer_2 = None
126
- vae = None
127
- feature_extractor = None
128
- image_encoder = None
129
- transformer = None
130
-
131
- # The logic below reuses the model-loading section of the 'second code' almost as-is
132
- def load_global_models():
133
- global text_encoder, text_encoder_2, tokenizer, tokenizer_2
134
- global vae, feature_extractor, image_encoder, transformer
135
- global cpu_fallback_mode
136
-
137
- # Skip if already loaded
138
- if transformer is not None:
139
- return
140
-
141
- # GPU memory info
142
- device = gpu if GPU_AVAILABLE else cpu
143
-
144
- # diffusers_helper.memory.get_cuda_free_memory_gb(gpu) could be used for a more accurate value
145
- print("Loading models...")
146
-
147
- # ======== Actual code, based on the second example =========
148
- # (1) Hybrid: if high_vram -> load onto the GPU, otherwise CPU + DynamicSwap
149
-
150
- # Always load in float16 / bfloat16
151
- text_encoder_local = LlamaModel.from_pretrained(
152
- "hunyuanvideo-community/HunyuanVideo",
153
- subfolder='text_encoder',
154
- torch_dtype=torch.float16
155
- ).cpu()
156
-
157
- text_encoder_2_local = CLIPTextModel.from_pretrained(
158
- "hunyuanvideo-community/HunyuanVideo",
159
- subfolder='text_encoder_2',
160
- torch_dtype=torch.float16
161
- ).cpu()
162
-
163
- tokenizer_local = LlamaTokenizerFast.from_pretrained(
164
- "hunyuanvideo-community/HunyuanVideo",
165
- subfolder='tokenizer'
166
- )
167
- tokenizer_2_local = CLIPTokenizer.from_pretrained(
168
- "hunyuanvideo-community/HunyuanVideo",
169
- subfolder='tokenizer_2'
170
- )
171
-
172
- vae_local = AutoencoderKLHunyuanVideo.from_pretrained(
173
- "hunyuanvideo-community/HunyuanVideo",
174
- subfolder='vae',
175
- torch_dtype=torch.float16
176
- ).cpu()
177
-
178
- feature_extractor_local = SiglipImageProcessor.from_pretrained(
179
- "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
180
- )
181
- image_encoder_local = SiglipVisionModel.from_pretrained(
182
- "lllyasviel/flux_redux_bfl",
183
- subfolder='image_encoder',
184
- torch_dtype=torch.float16
185
- ).cpu()
186
-
187
- # FramePack_F1_I2V_HY_20250503 (bfloat16)
188
- transformer_local = HunyuanVideoTransformer3DModelPacked.from_pretrained(
189
- 'lllyasviel/FramePack_F1_I2V_HY_20250503',
190
- torch_dtype=torch.bfloat16
191
- ).cpu()
192
-
193
- # eval & dtype
194
- vae_local.eval()
195
- text_encoder_local.eval()
196
- text_encoder_2_local.eval()
197
- image_encoder_local.eval()
198
- transformer_local.eval()
199
-
200
- # VAE slicing for low VRAM
201
- if not high_vram:
202
- vae_local.enable_slicing()
203
- vae_local.enable_tiling()
204
-
205
- # For offloading
206
- transformer_local.high_quality_fp32_output_for_inference = True
207
- transformer_local.to(dtype=torch.bfloat16)
208
- vae_local.to(dtype=torch.float16)
209
- image_encoder_local.to(dtype=torch.float16)
210
- text_encoder_local.to(dtype=torch.float16)
211
- text_encoder_2_local.to(dtype=torch.float16)
212
-
213
- # requires_grad_(False)
214
- for m in [vae_local, text_encoder_local, text_encoder_2_local, image_encoder_local, transformer_local]:
215
- m.requires_grad_(False)
216
-
217
- # With a GPU and enough VRAM, keep everything on the GPU
218
- # otherwise use DynamicSwap
219
- if GPU_AVAILABLE:
220
- if not high_vram:
221
- DynamicSwapInstaller.install_model(transformer_local, device=gpu)
222
- DynamicSwapInstaller.install_model(text_encoder_local, device=gpu)
223
- else:
224
- text_encoder_local.to(gpu)
225
- text_encoder_2_local.to(gpu)
226
- image_encoder_local.to(gpu)
227
- vae_local.to(gpu)
228
- transformer_local.to(gpu)
229
- else:
230
- cpu_fallback_mode = True
231
-
232
- # Assign to globals
233
- print("Model loaded.")
234
- text_encoder = text_encoder_local
235
- text_encoder_2 = text_encoder_2_local
236
- tokenizer = tokenizer_local
237
- tokenizer_2 = tokenizer_2_local
238
- vae = vae_local
239
- feature_extractor = feature_extractor_local
240
- image_encoder = image_encoder_local
241
- transformer = transformer_local
242
-
243
- #############################################
244
- # Worker logic (taken from the second code unchanged)
245
- #############################################
246
- stream = AsyncStream()
247
-
248
- outputs_folder = './outputs/'
249
- os.makedirs(outputs_folder, exist_ok=True)
250
-
251
- @torch.no_grad()
252
- def worker(
253
- input_image, prompt, n_prompt, seed,
254
- total_second_length, latent_window_size, steps,
255
- cfg, gs, rs, gpu_memory_preservation, use_teacache
256
- ):
257
- """
258
- Actual sampling logic (based on the second code)
259
- """
260
- load_global_models() # load models
261
- global text_encoder, text_encoder_2, tokenizer, tokenizer_2
262
- global vae, feature_extractor, image_encoder, transformer
263
- global last_update_time
264
-
265
- # Cap at 4 seconds
266
- total_second_length = min(total_second_length, 4.0)
267
-
268
- total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
269
- total_latent_sections = int(max(round(total_latent_sections), 1))
270
-
271
- job_id = generate_timestamp()
272
-
273
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
274
-
275
- try:
276
- # Unload models when GPU memory is low
277
- if not high_vram and GPU_AVAILABLE:
278
- unload_complete_models(
279
- text_encoder, text_encoder_2, image_encoder, vae, transformer
280
- )
281
-
282
- # Text encoding
283
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
284
-
285
- if not high_vram and GPU_AVAILABLE:
286
- fake_diffusers_current_device(text_encoder, gpu)
287
- load_model_as_complete(text_encoder_2, target_device=gpu)
288
-
289
- llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
290
- if cfg == 1.0:
291
- llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
292
- else:
293
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
294
-
295
- llama_vec, llama_mask = crop_or_pad_yield_mask(llama_vec, length=512)
296
- llama_vec_n, llama_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
297
-
298
- # Image processing
299
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
300
-
301
- H, W, C = input_image.shape
302
- height, width = find_nearest_bucket(H, W, resolution=640)
303
-
304
- if cpu_fallback_mode:
305
- height = min(height, 320)
306
- width = min(width, 320)
307
-
308
- input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
309
-
310
- Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
311
-
312
- input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
313
- input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
314
-
315
- # VAE encode
316
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
317
-
318
- if not high_vram and GPU_AVAILABLE:
319
- load_model_as_complete(vae, target_device=gpu)
320
- start_latent = vae_encode(input_image_pt, vae)
321
-
322
- # CLIP Vision
323
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
324
-
325
- if not high_vram and GPU_AVAILABLE:
326
- load_model_as_complete(image_encoder, target_device=gpu)
327
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
328
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
329
-
330
- # dtype
331
- llama_vec = llama_vec.to(transformer.dtype)
332
- llama_vec_n = llama_vec_n.to(transformer.dtype)
333
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
334
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
335
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
336
-
337
- # Start sampling
338
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
339
-
340
- rnd = torch.Generator("cpu").manual_seed(seed)
341
-
342
- # Initial history latents
343
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
344
- history_pixels = None
345
-
346
- # Append start_latent
347
- history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
348
- total_generated_latent_frames = 1
349
-
350
- for section_index in range(total_latent_sections):
351
- if stream.input_queue.top() == 'end':
352
- stream.output_queue.push(('end', None))
353
- return
354
-
355
- print(f'Section {section_index+1}/{total_latent_sections}')
356
-
357
- if not high_vram and GPU_AVAILABLE:
358
- unload_complete_models()
359
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
360
-
361
- # teacache
362
- if use_teacache:
363
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
364
- else:
365
- transformer.initialize_teacache(enable_teacache=False)
366
-
367
- def callback(d):
368
- preview = d['denoised']
369
- preview = vae_decode_fake(preview)
370
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
371
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
372
 
373
- if stream.input_queue.top() == 'end':
374
- stream.output_queue.push(('end', None))
375
- raise KeyboardInterrupt('User stops generation.')
376
 
377
- current_step = d['i'] + 1
378
- percentage = int(100.0 * current_step / steps)
379
- hint = f'Sampling {current_step}/{steps}'
380
- desc = f'Section {section_index+1}/{total_latent_sections}'
381
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
382
- return
383
 
384
- # indices
385
- frames_per_section = latent_window_size * 4 - 3
386
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
387
- (
388
- clean_latent_indices_start,
389
- clean_latent_4x_indices,
390
- clean_latent_2x_indices,
391
- clean_latent_1x_indices,
392
- latent_indices
393
- ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
394
 
395
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
 
 
 
 
396
 
397
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -19:, :, :].split([16, 2, 1], dim=2)
398
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
399
 
400
- try:
401
- generated_latents = sample_hunyuan(
402
- transformer=transformer,
403
- sampler='unipc',
404
- width=width,
405
- height=height,
406
- frames=frames_per_section,
407
- real_guidance_scale=cfg,
408
- distilled_guidance_scale=gs,
409
- guidance_rescale=rs,
410
- num_inference_steps=steps,
411
- generator=rnd,
412
- prompt_embeds=llama_vec,
413
- prompt_embeds_mask=llama_mask,
414
- prompt_poolers=clip_l_pooler,
415
- negative_prompt_embeds=llama_vec_n,
416
- negative_prompt_embeds_mask=llama_mask_n,
417
- negative_prompt_poolers=clip_l_pooler_n,
418
- device=gpu if GPU_AVAILABLE else cpu,
419
- dtype=torch.bfloat16,
420
- image_embeddings=image_encoder_last_hidden_state,
421
- latent_indices=latent_indices,
422
- clean_latents=clean_latents,
423
- clean_latent_indices=clean_latent_indices,
424
- clean_latents_2x=clean_latents_2x,
425
- clean_latent_2x_indices=clean_latent_2x_indices,
426
- clean_latents_4x=clean_latents_4x,
427
- clean_latent_4x_indices=clean_latent_4x_indices,
428
- callback=callback
429
- )
430
- except KeyboardInterrupt:
431
- print("User cancelled.")
432
- stream.output_queue.push(('end', None))
433
- return
434
- except Exception as e:
435
- traceback.print_exc()
436
- stream.output_queue.push(('end', None))
437
- return
438
 
439
- total_generated_latent_frames += generated_latents.shape[2]
440
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
 
 
 
441
 
442
- if not high_vram and GPU_AVAILABLE:
443
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
444
- load_model_as_complete(vae, target_device=gpu)
445
 
446
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
 
447
 
448
- if history_pixels is None:
449
- history_pixels = vae_decode(real_history_latents, vae).cpu()
450
- else:
451
- section_latent_frames = latent_window_size * 2
452
- overlapped_frames = frames_per_section
453
- current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
454
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
455
 
456
- if not high_vram and GPU_AVAILABLE:
457
- unload_complete_models()
 
 
 
458
 
459
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
460
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=16) # CRF=16
 
 
 
 
 
 
 
 
461
 
462
- stream.output_queue.push(('file', output_filename))
463
 
464
- except:
465
- traceback.print_exc()
466
- if not high_vram and GPU_AVAILABLE:
467
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
468
 
469
- stream.output_queue.push(('end', None))
470
- return
 
 
 
471
 
472
- def end_process():
473
- """
474
- Stop request
475
- """
476
- global stream
477
- stream.input_queue.push('end')
 
 
 
 
 
 
 
 
478
 
479
- # Gradio invokes this worker function asynchronously
480
- def process(
481
- input_image, prompt, n_prompt, seed,
482
- total_second_length, latent_window_size, steps,
483
- cfg, gs, rs, gpu_memory_preservation, use_teacache
484
- ):
485
  global stream
486
- if input_image is None:
487
- raise ValueError("No input image provided.")
 
 
 
 
488
 
489
- yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
490
 
491
  stream = AsyncStream()
492
- async_run(
493
- worker,
494
- input_image, prompt, n_prompt, seed,
495
- total_second_length, latent_window_size, steps,
496
- cfg, gs, rs, gpu_memory_preservation, use_teacache
497
- )
498
 
499
  output_filename = None
500
- prev_filename = None
501
- error_message = None
502
 
503
  while True:
504
  flag, data = stream.output_queue.next()
 
505
  if flag == 'file':
506
  output_filename = data
507
- prev_filename = output_filename
508
- yield output_filename, gr.update(), gr.update(), "", gr.update(interactive=False), gr.update(interactive=True)
509
 
510
- elif flag == 'progress':
511
  preview, desc, html = data
512
  yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
513
 
514
- elif flag == 'error':
515
- error_message = data
516
- print(f"Error: {error_message}")
517
-
518
- elif flag == 'end':
519
- if output_filename is None and prev_filename:
520
- output_filename = prev_filename
521
- # Show the error if one occurred
522
- if error_message:
523
- yield (
524
- output_filename, # last file (or None)
525
- gr.update(visible=False),
526
- gr.update(),
527
- f"<div style='color:red;'>{error_message}</div>",
528
- gr.update(interactive=True),
529
- gr.update(interactive=False)
530
- )
531
- else:
532
- yield (
533
- output_filename, gr.update(visible=False), gr.update(), "", gr.update(interactive=True), gr.update(interactive=False)
534
- )
535
  break
536
 
537
- # UI CSS
538
- def make_custom_css():
539
- base_progress_css = make_progress_bar_css()
540
- pastel_css = """
541
- body {
542
- background: #faf9ff !important;
543
- font-family: "Noto Sans", sans-serif;
544
- }
545
- #app-container {
546
- max-width: 1200px;
547
- margin: 0 auto;
548
- padding: 1rem;
549
- position: relative;
550
- }
551
- #app-container h1 {
552
- color: #5F5AA2;
553
- margin-bottom: 1.2rem;
554
- font-weight: 700;
555
- text-shadow: 1px 1px 2px #bbb;
556
- }
557
- .gr-panel {
558
- background: #ffffffcc;
559
- border: 1px solid #e1dff0;
560
- border-radius: 8px;
561
- padding: 1rem;
562
- box-shadow: 0 1px 3px rgba(0,0,0,0.1);
563
- }
564
- .button-container button {
565
- min-height: 45px;
566
- font-size: 1rem;
567
- font-weight: 600;
568
- border-radius: 6px;
569
- }
570
- .button-container button#start-button {
571
- background-color: #A289E3 !important;
572
- color: #fff !important;
573
- border: 1px solid #a58de2;
574
- }
575
- .button-container button#stop-button {
576
- background-color: #F48A9B !important;
577
- color: #fff !important;
578
- border: 1px solid #f18fa0;
579
- }
580
- .button-container button:hover {
581
- filter: brightness(0.95);
582
- }
583
- .preview-container, .video-container {
584
- border: 1px solid #ded9f2;
585
- border-radius: 8px;
586
- overflow: hidden;
587
- }
588
- .progress-container {
589
- margin-top: 15px;
590
- margin-bottom: 15px;
591
- }
592
- .error-message {
593
- background-color: #FFF5F5;
594
- border: 1px solid #FED7D7;
595
- color: #E53E3E;
596
- padding: 10px;
597
- border-radius: 4px;
598
- margin-top: 10px;
599
- font-weight: 500;
600
- }
601
- @media (max-width: 768px) {
602
- #app-container {
603
- padding: 0.5rem;
604
- }
605
- .mobile-full-width {
606
- flex-direction: column !important;
607
- }
608
- .mobile-full-width > .gr-block {
609
- width: 100% !important;
610
- }
611
- }
612
- """
613
- return base_progress_css + pastel_css
614
-
615
- css = make_custom_css()
616
-
617
- # Sample prompts
618
- quick_prompts = [
619
- ["The girl dances gracefully, with clear movements, full of charm."],
620
- ["A character doing some simple body movements."]
621
- ]
622
-
623
- # Gradio UI
624
- block = gr.Blocks(css=css).queue()
625
- with block:
626
- gr.HTML("<div id='app-container'><h1>FramePack - Image to Video Generation</h1></div>")
627
-
628
- with gr.Row(elem_classes="mobile-full-width"):
629
- # Left column
630
- with gr.Column(scale=1, elem_classes="gr-panel"):
631
- input_image = gr.Image(
632
- label=get_translation("upload_image"),
633
- type="numpy",
634
- height=320
635
- )
636
- prompt = gr.Textbox(
637
- label=get_translation("prompt"),
638
- value=''
639
- )
640
-
641
- example_quick_prompts = gr.Dataset(
642
- samples=quick_prompts,
643
- label=get_translation("quick_prompts"),
644
- samples_per_page=1000,
645
- components=[prompt]
646
- )
647
- example_quick_prompts.click(
648
- fn=lambda x: x[0],
649
- inputs=[example_quick_prompts],
650
- outputs=prompt,
651
- show_progress=False,
652
- queue=False
653
- )
654
-
655
- # Right column
656
- with gr.Column(scale=1, elem_classes="gr-panel"):
657
- with gr.Row(elem_classes="button-container"):
658
- start_button = gr.Button(
659
- value=get_translation("start_generation"),
660
- elem_id="start-button",
661
- variant="primary"
662
- )
663
- stop_button = gr.Button(
664
- value=get_translation("stop_generation"),
665
- elem_id="stop-button",
666
- interactive=False
667
- )
668
-
669
- result_video = gr.Video(
670
- label=get_translation("generated_video"),
671
- autoplay=True,
672
- loop=True,
673
- height=320,
674
- elem_classes="video-container"
675
- )
676
- preview_image = gr.Image(
677
- label=get_translation("next_latents"),
678
- visible=False,
679
- height=150,
680
- elem_classes="preview-container"
681
- )
682
- gr.Markdown(get_translation("sampling_note"))
683
-
684
- with gr.Group(elem_classes="progress-container"):
685
- progress_desc = gr.Markdown('')
686
- progress_bar = gr.HTML('')
687
-
688
- error_message = gr.HTML('', visible=True)
689
-
690
- # Advanced
691
- with gr.Accordion("Advanced Settings", open=False, elem_classes="gr-panel"):
692
- use_teacache = gr.Checkbox(
693
- label=get_translation("use_teacache"),
694
- value=True,
695
- info=get_translation("teacache_info")
696
- )
697
- n_prompt = gr.Textbox(label=get_translation("negative_prompt"), value="", visible=False)
698
- seed = gr.Number(
699
- label=get_translation("seed"),
700
- value=31337,
701
- precision=0
702
- )
703
- # Default 2 seconds, max 4 seconds
704
- total_second_length = gr.Slider(
705
- label=get_translation("video_length"),
706
- minimum=1,
707
- maximum=4,
708
- value=2,
709
- step=0.1
710
- )
711
- latent_window_size = gr.Slider(
712
- label=get_translation("latent_window"),
713
- minimum=1,
714
- maximum=33,
715
- value=9,
716
- step=1,
717
- visible=False
718
- )
719
- steps = gr.Slider(
720
- label=get_translation("steps"),
721
- minimum=1,
722
- maximum=100,
723
- value=25,
724
- step=1,
725
- info=get_translation("steps_info")
726
- )
727
- cfg = gr.Slider(
728
- label=get_translation("cfg_scale"),
729
- minimum=1.0,
730
- maximum=32.0,
731
- value=1.0,
732
- step=0.01,
733
- visible=False
734
- )
735
- gs = gr.Slider(
736
- label=get_translation("distilled_cfg"),
737
- minimum=1.0,
738
- maximum=32.0,
739
- value=10.0,
740
- step=0.01,
741
- info=get_translation("distilled_cfg_info")
742
- )
743
- rs = gr.Slider(
744
- label=get_translation("cfg_rescale"),
745
- minimum=0.0,
746
- maximum=1.0,
747
- value=0.0,
748
- step=0.01,
749
- visible=False
750
- )
751
- gpu_memory_preservation = gr.Slider(
752
- label=get_translation("gpu_memory"),
753
- minimum=6,
754
- maximum=128,
755
- value=6,
756
- step=0.1,
757
- info=get_translation("gpu_memory_info")
758
- )
759
-
760
- # Button wiring
761
- inputs_list = [
762
- input_image, prompt, n_prompt, seed,
763
- total_second_length, latent_window_size, steps,
764
- cfg, gs, rs, gpu_memory_preservation, use_teacache
765
- ]
766
- start_button.click(
767
- fn=process,
768
- inputs=inputs_list,
769
- outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, stop_button]
770
- )
771
- stop_button.click(fn=end_process)
772
-
773
- block.launch()
774
- #############################################
775
- # from diffusers_helper.hf_login import login
776
- # Use HF login if needed (uncomment the import above)
777
- #############################################
778
-
779
- import os
780
-
781
- os.environ['HF_HOME'] = os.path.abspath(
782
- os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
783
- )
784
-
785
- import gradio as gr
786
- import torch
787
- import traceback
788
- import einops
789
- import safetensors.torch as sf
790
- import numpy as np
791
- import math
792
- import time
793
-
794
- # Check whether we are running in a Hugging Face Spaces environment
795
- IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
796
-
797
- # --------- Translation dictionary (English only) ---------
798
- translations = {
799
- "en": {
800
- "title": "FramePack - Image to Video Generation",
801
- "upload_image": "Upload Image",
802
- "prompt": "Prompt",
803
- "quick_prompts": "Quick Prompts",
804
- "start_generation": "Generate",
805
- "stop_generation": "Stop",
806
- "use_teacache": "Use TeaCache",
807
- "teacache_info": "Faster speed, but may result in slightly worse finger and hand generation.",
808
- "negative_prompt": "Negative Prompt",
809
- "seed": "Seed",
810
- # UI label updated to reflect the 4-second maximum
811
- "video_length": "Video Length (max 4 seconds)",
812
- "latent_window": "Latent Window Size",
813
- "steps": "Inference Steps",
814
- "steps_info": "Changing this value is not recommended.",
815
- "cfg_scale": "CFG Scale",
816
- "distilled_cfg": "Distilled CFG Scale",
817
- "distilled_cfg_info": "Changing this value is not recommended.",
818
- "cfg_rescale": "CFG Rescale",
819
- "gpu_memory": "GPU Memory Preservation (GB) (larger means slower)",
820
- "gpu_memory_info": "Set this to a larger value if you encounter OOM errors. Larger values cause slower speed.",
821
- "next_latents": "Next Latents",
822
- "generated_video": "Generated Video",
823
- "sampling_note": "Note: The model predicts future frames from past frames. If the start action isn't immediately visible, please wait for more frames.",
824
- "error_message": "Error",
825
- "processing_error": "Processing error",
826
- "network_error": "Network connection is unstable, model download timed out. Please try again later.",
827
- "memory_error": "GPU memory insufficient, please try increasing GPU memory preservation value or reduce video length.",
828
- "model_error": "Failed to load model, possibly due to network issues or high server load. Please try again later.",
829
- "partial_video": "Processing error, but partial video has been generated",
830
- "processing_interrupt": "Processing was interrupted, but partial video has been generated"
831
- }
832
- }
833
-
834
- def get_translation(key):
835
- return translations["en"].get(key, key)
836
-
837
- #############################################
838
- # diffusers_helper imports
839
- #############################################
840
- from diffusers_helper.thread_utils import AsyncStream, async_run
841
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
842
- from diffusers_helper.memory import (
843
- cpu,
844
- gpu,
845
- get_cuda_free_memory_gb,
846
- move_model_to_device_with_memory_preservation,
847
- offload_model_from_device_for_memory_preservation,
848
- fake_diffusers_current_device,
849
- DynamicSwapInstaller,
850
- unload_complete_models,
851
- load_model_as_complete
852
- )
853
- from diffusers_helper.utils import (
854
- generate_timestamp,
855
- save_bcthw_as_mp4,
856
- resize_and_center_crop,
857
- crop_or_pad_yield_mask,
858
- soft_append_bcthw
859
- )
860
- from diffusers_helper.bucket_tools import find_nearest_bucket
861
- from diffusers_helper.hunyuan import (
862
- encode_prompt_conds, vae_encode, vae_decode, vae_decode_fake
863
- )
864
- from diffusers_helper.clip_vision import hf_clip_vision_encode
865
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
866
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
867
-
868
- from diffusers import AutoencoderKLHunyuanVideo
869
- from transformers import (
870
- LlamaModel, CLIPTextModel,
871
- LlamaTokenizerFast, CLIPTokenizer,
872
- SiglipVisionModel, SiglipImageProcessor
873
- )
874
-
875
- #############################################
876
- # GPU check
877
- #############################################
878
- GPU_AVAILABLE = torch.cuda.is_available()
879
- free_mem_gb = 0.0
880
- high_vram = False
881
- if GPU_AVAILABLE:
882
- try:
883
- free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
884
- high_vram = (free_mem_gb > 60)
885
- except:
886
- pass
887
- print(f"GPU Available: {GPU_AVAILABLE}, free_mem_gb={free_mem_gb}, high_vram={high_vram}")
888
-
889
- cpu_fallback_mode = not GPU_AVAILABLE
890
- last_update_time = time.time()
891
-
892
- #############################################
893
- # Model loading (global)
894
- #############################################
895
- text_encoder = None
896
- text_encoder_2 = None
897
- tokenizer = None
898
- tokenizer_2 = None
899
- vae = None
900
- feature_extractor = None
901
- image_encoder = None
902
- transformer = None
903
-
904
- # The logic below reuses the model-loading section of the 'second code' almost as-is
905
- def load_global_models():
906
- global text_encoder, text_encoder_2, tokenizer, tokenizer_2
907
- global vae, feature_extractor, image_encoder, transformer
908
- global cpu_fallback_mode
909
-
910
- # Skip if already loaded
911
- if transformer is not None:
912
- return
913
-
914
- # GPU memory info
915
- device = gpu if GPU_AVAILABLE else cpu
916
-
917
- # diffusers_helper.memory.get_cuda_free_memory_gb(gpu) could be used for a more accurate value
918
- print("Loading models...")
919
-
920
- # ======== Actual code, based on the second example =========
921
- # (1) Hybrid: if high_vram -> load onto the GPU, otherwise CPU + DynamicSwap
922
-
923
- # Always load in float16 / bfloat16
924
- text_encoder_local = LlamaModel.from_pretrained(
925
- "hunyuanvideo-community/HunyuanVideo",
926
- subfolder='text_encoder',
927
- torch_dtype=torch.float16
928
- ).cpu()
929
-
930
- text_encoder_2_local = CLIPTextModel.from_pretrained(
931
- "hunyuanvideo-community/HunyuanVideo",
932
- subfolder='text_encoder_2',
933
- torch_dtype=torch.float16
934
- ).cpu()
935
-
936
- tokenizer_local = LlamaTokenizerFast.from_pretrained(
937
- "hunyuanvideo-community/HunyuanVideo",
938
- subfolder='tokenizer'
939
- )
940
- tokenizer_2_local = CLIPTokenizer.from_pretrained(
941
- "hunyuanvideo-community/HunyuanVideo",
942
- subfolder='tokenizer_2'
943
- )
944
-
945
- vae_local = AutoencoderKLHunyuanVideo.from_pretrained(
946
- "hunyuanvideo-community/HunyuanVideo",
947
- subfolder='vae',
948
- torch_dtype=torch.float16
949
- ).cpu()
950
-
951
- feature_extractor_local = SiglipImageProcessor.from_pretrained(
952
- "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
953
- )
954
- image_encoder_local = SiglipVisionModel.from_pretrained(
955
- "lllyasviel/flux_redux_bfl",
956
- subfolder='image_encoder',
957
- torch_dtype=torch.float16
958
- ).cpu()
959
-
960
- # FramePack_F1_I2V_HY_20250503 (bfloat16)
961
- transformer_local = HunyuanVideoTransformer3DModelPacked.from_pretrained(
962
- 'lllyasviel/FramePack_F1_I2V_HY_20250503',
963
- torch_dtype=torch.bfloat16
964
- ).cpu()
965
-
966
- # eval & dtype
967
- vae_local.eval()
968
- text_encoder_local.eval()
969
- text_encoder_2_local.eval()
970
- image_encoder_local.eval()
971
- transformer_local.eval()
972
-
973
- # VAE slicing for low VRAM
974
- if not high_vram:
975
- vae_local.enable_slicing()
976
- vae_local.enable_tiling()
977
-
978
- # For offloading
979
- transformer_local.high_quality_fp32_output_for_inference = True
980
- transformer_local.to(dtype=torch.bfloat16)
981
- vae_local.to(dtype=torch.float16)
982
- image_encoder_local.to(dtype=torch.float16)
983
- text_encoder_local.to(dtype=torch.float16)
984
- text_encoder_2_local.to(dtype=torch.float16)
985
-
986
- # requires_grad_(False)
987
- for m in [vae_local, text_encoder_local, text_encoder_2_local, image_encoder_local, transformer_local]:
988
- m.requires_grad_(False)
989
-
990
- # With a GPU and enough VRAM, keep everything on the GPU
991
- # otherwise use DynamicSwap
992
- if GPU_AVAILABLE:
993
- if not high_vram:
994
- DynamicSwapInstaller.install_model(transformer_local, device=gpu)
995
- DynamicSwapInstaller.install_model(text_encoder_local, device=gpu)
996
- else:
997
- text_encoder_local.to(gpu)
998
- text_encoder_2_local.to(gpu)
999
- image_encoder_local.to(gpu)
1000
- vae_local.to(gpu)
1001
- transformer_local.to(gpu)
1002
- else:
1003
- cpu_fallback_mode = True
1004
-
1005
- # Assign to globals
1006
- print("Model loaded.")
1007
- text_encoder = text_encoder_local
1008
- text_encoder_2 = text_encoder_2_local
1009
- tokenizer = tokenizer_local
1010
- tokenizer_2 = tokenizer_2_local
1011
- vae = vae_local
1012
- feature_extractor = feature_extractor_local
1013
- image_encoder = image_encoder_local
1014
- transformer = transformer_local
1015
-
1016
- #############################################
1017
- # Worker logic (taken from the second code unchanged)
1018
- #############################################
1019
- stream = AsyncStream()
1020
-
1021
- outputs_folder = './outputs/'
1022
- os.makedirs(outputs_folder, exist_ok=True)
1023
 
 
1024
  @torch.no_grad()
1025
- def worker(
1026
- input_image, prompt, n_prompt, seed,
1027
- total_second_length, latent_window_size, steps,
1028
- cfg, gs, rs, gpu_memory_preservation, use_teacache
1029
- ):
1030
- """
1031
- Actual sampling logic (based on the second code)
1032
- """
1033
- load_global_models() # load models
1034
- global text_encoder, text_encoder_2, tokenizer, tokenizer_2
1035
- global vae, feature_extractor, image_encoder, transformer
1036
- global last_update_time
1037
-
1038
- # Cap at 4 seconds
1039
- total_second_length = min(total_second_length, 4.0)
1040
-
1041
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
1042
  total_latent_sections = int(max(round(total_latent_sections), 1))
1043
 
@@ -1046,38 +149,36 @@ def worker(
1046
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
1047
 
1048
  try:
1049
- # Unload models when GPU memory is low
1050
- if not high_vram and GPU_AVAILABLE:
1051
  unload_complete_models(
1052
  text_encoder, text_encoder_2, image_encoder, vae, transformer
1053
  )
1054
 
1055
  # Text encoding
 
1056
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
1057
 
1058
- if not high_vram and GPU_AVAILABLE:
1059
- fake_diffusers_current_device(text_encoder, gpu)
1060
  load_model_as_complete(text_encoder_2, target_device=gpu)
1061
 
1062
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1063
- if cfg == 1.0:
 
1064
  llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
1065
  else:
1066
  llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1067
 
1068
- llama_vec, llama_mask = crop_or_pad_yield_mask(llama_vec, length=512)
1069
- llama_vec_n, llama_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
 
 
1070
 
1071
- # Image processing
1072
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
1073
 
1074
  H, W, C = input_image.shape
1075
  height, width = find_nearest_bucket(H, W, resolution=640)
1076
-
1077
- if cpu_fallback_mode:
1078
- height = min(height, 320)
1079
- width = min(width, 320)
1080
-
1081
  input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
1082
 
1083
  Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
@@ -1085,38 +186,42 @@ def worker(
1085
  input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
1086
  input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
1087
 
1088
- # VAE encode
 
1089
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
1090
 
1091
- if not high_vram and GPU_AVAILABLE:
1092
  load_model_as_complete(vae, target_device=gpu)
 
1093
  start_latent = vae_encode(input_image_pt, vae)
1094
 
1095
  # CLIP Vision
 
1096
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
1097
 
1098
- if not high_vram and GPU_AVAILABLE:
1099
  load_model_as_complete(image_encoder, target_device=gpu)
 
1100
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
1101
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
1102
 
1103
- # dtype
 
1104
  llama_vec = llama_vec.to(transformer.dtype)
1105
  llama_vec_n = llama_vec_n.to(transformer.dtype)
1106
  clip_l_pooler = clip_l_pooler.to(transformer.dtype)
1107
  clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
1108
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
1109
 
1110
- # Start sampling
 
1111
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
1112
 
1113
  rnd = torch.Generator("cpu").manual_seed(seed)
1114
 
1115
- # Initial history latents
1116
  history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
1117
  history_pixels = None
1118
 
1119
- # Append start_latent
1120
  history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
1121
  total_generated_latent_frames = 1
1122
 
@@ -1125,13 +230,12 @@ def worker(
1125
  stream.output_queue.push(('end', None))
1126
  return
1127
 
1128
- print(f'Section {section_index+1}/{total_latent_sections}')
1129
 
1130
- if not high_vram and GPU_AVAILABLE:
1131
  unload_complete_models()
1132
  move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
1133
 
1134
- # teacache
1135
  if use_teacache:
1136
  transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
1137
  else:
@@ -1140,79 +244,63 @@ def worker(
1140
  def callback(d):
1141
  preview = d['denoised']
1142
  preview = vae_decode_fake(preview)
 
1143
  preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
1144
  preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
1145
 
1146
  if stream.input_queue.top() == 'end':
1147
  stream.output_queue.push(('end', None))
1148
- raise KeyboardInterrupt('User stops generation.')
1149
 
1150
  current_step = d['i'] + 1
1151
  percentage = int(100.0 * current_step / steps)
1152
  hint = f'Sampling {current_step}/{steps}'
1153
- desc = f'Section {section_index+1}/{total_latent_sections}'
1154
  stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
1155
  return
1156
 
1157
- # indices
1158
- frames_per_section = latent_window_size * 4 - 3
1159
  indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
1160
- (
1161
- clean_latent_indices_start,
1162
- clean_latent_4x_indices,
1163
- clean_latent_2x_indices,
1164
- clean_latent_1x_indices,
1165
- latent_indices
1166
- ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
1167
-
1168
  clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1169
 
1170
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -19:, :, :].split([16, 2, 1], dim=2)
1171
  clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
1172
 
1173
- try:
1174
- generated_latents = sample_hunyuan(
1175
- transformer=transformer,
1176
- sampler='unipc',
1177
- width=width,
1178
- height=height,
1179
- frames=frames_per_section,
1180
- real_guidance_scale=cfg,
1181
- distilled_guidance_scale=gs,
1182
- guidance_rescale=rs,
1183
- num_inference_steps=steps,
1184
- generator=rnd,
1185
- prompt_embeds=llama_vec,
1186
- prompt_embeds_mask=llama_mask,
1187
- prompt_poolers=clip_l_pooler,
1188
- negative_prompt_embeds=llama_vec_n,
1189
- negative_prompt_embeds_mask=llama_mask_n,
1190
- negative_prompt_poolers=clip_l_pooler_n,
1191
- device=gpu if GPU_AVAILABLE else cpu,
1192
- dtype=torch.bfloat16,
1193
- image_embeddings=image_encoder_last_hidden_state,
1194
- latent_indices=latent_indices,
1195
- clean_latents=clean_latents,
1196
- clean_latent_indices=clean_latent_indices,
1197
- clean_latents_2x=clean_latents_2x,
1198
- clean_latent_2x_indices=clean_latent_2x_indices,
1199
- clean_latents_4x=clean_latents_4x,
1200
- clean_latent_4x_indices=clean_latent_4x_indices,
1201
- callback=callback
1202
- )
1203
- except KeyboardInterrupt:
1204
- print("User cancelled.")
1205
- stream.output_queue.push(('end', None))
1206
- return
1207
- except Exception as e:
1208
- traceback.print_exc()
1209
- stream.output_queue.push(('end', None))
1210
- return
1211
 
1212
- total_generated_latent_frames += generated_latents.shape[2]
1213
  history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
1214
 
1215
- if not high_vram and GPU_AVAILABLE:
1216
  offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
1217
  load_model_as_complete(vae, target_device=gpu)
1218
 
@@ -1222,325 +310,178 @@ def worker(
1222
  history_pixels = vae_decode(real_history_latents, vae).cpu()
1223
  else:
1224
  section_latent_frames = latent_window_size * 2
1225
- overlapped_frames = frames_per_section
 
1226
  current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
1227
  history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
1228
 
1229
- if not high_vram and GPU_AVAILABLE:
1230
  unload_complete_models()
1231
 
1232
  output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
1233
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=16) # CRF=16
1234
 
1235
- stream.output_queue.push(('file', output_filename))
1236
 
 
 
 
1237
  except:
1238
  traceback.print_exc()
1239
- if not high_vram and GPU_AVAILABLE:
1240
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
 
 
 
1241
 
1242
  stream.output_queue.push(('end', None))
1243
  return
1244
 
1245
- def end_process():
1246
- """
1247
- Stop request
1248
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  global stream
1250
- stream.input_queue.push('end')
1251
-
1252
- # Gradio invokes this worker function asynchronously
1253
- def process(
1254
- input_image, prompt, n_prompt, seed,
1255
- total_second_length, latent_window_size, steps,
1256
- cfg, gs, rs, gpu_memory_preservation, use_teacache
1257
- ):
1258
- global stream
1259
- if input_image is None:
1260
- raise ValueError("No input image provided.")
1261
-
1262
- yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1263
 
1264
  stream = AsyncStream()
1265
- async_run(
1266
- worker,
1267
- input_image, prompt, n_prompt, seed,
1268
- total_second_length, latent_window_size, steps,
1269
- cfg, gs, rs, gpu_memory_preservation, use_teacache
1270
- )
1271
 
1272
  output_filename = None
1273
- prev_filename = None
1274
- error_message = None
1275
 
1276
  while True:
1277
  flag, data = stream.output_queue.next()
 
1278
  if flag == 'file':
1279
  output_filename = data
1280
- prev_filename = output_filename
1281
- yield output_filename, gr.update(), gr.update(), "", gr.update(interactive=False), gr.update(interactive=True)
1282
 
1283
- elif flag == 'progress':
1284
  preview, desc, html = data
1285
  yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
1286
 
1287
- elif flag == 'error':
1288
- error_message = data
1289
- print(f"Error: {error_message}")
1290
-
1291
- elif flag == 'end':
1292
- if output_filename is None and prev_filename:
1293
- output_filename = prev_filename
1294
- # Show the error if one occurred
1295
- if error_message:
1296
- yield (
1297
- output_filename, # last file (or None)
1298
- gr.update(visible=False),
1299
- gr.update(),
1300
- f"<div style='color:red;'>{error_message}</div>",
1301
- gr.update(interactive=True),
1302
- gr.update(interactive=False)
1303
- )
1304
- else:
1305
- yield (
1306
- output_filename, gr.update(visible=False), gr.update(), "", gr.update(interactive=True), gr.update(interactive=False)
1307
- )
1308
  break
1309
 
1310
- # UI CSS
1311
- def make_custom_css():
1312
- base_progress_css = make_progress_bar_css()
1313
- pastel_css = """
1314
- body {
1315
- background: #faf9ff !important;
1316
- font-family: "Noto Sans", sans-serif;
1317
- }
1318
- #app-container {
1319
- max-width: 1200px;
1320
- margin: 0 auto;
1321
- padding: 1rem;
1322
- position: relative;
1323
- }
1324
- #app-container h1 {
1325
- color: #5F5AA2;
1326
- margin-bottom: 1.2rem;
1327
- font-weight: 700;
1328
- text-shadow: 1px 1px 2px #bbb;
1329
- }
1330
- .gr-panel {
1331
- background: #ffffffcc;
1332
- border: 1px solid #e1dff0;
1333
- border-radius: 8px;
1334
- padding: 1rem;
1335
- box-shadow: 0 1px 3px rgba(0,0,0,0.1);
1336
- }
1337
- .button-container button {
1338
- min-height: 45px;
1339
- font-size: 1rem;
1340
- font-weight: 600;
1341
- border-radius: 6px;
1342
- }
1343
- .button-container button#start-button {
1344
- background-color: #A289E3 !important;
1345
- color: #fff !important;
1346
- border: 1px solid #a58de2;
1347
- }
1348
- .button-container button#stop-button {
1349
- background-color: #F48A9B !important;
1350
- color: #fff !important;
1351
- border: 1px solid #f18fa0;
1352
- }
1353
- .button-container button:hover {
1354
- filter: brightness(0.95);
1355
- }
1356
- .preview-container, .video-container {
1357
- border: 1px solid #ded9f2;
1358
- border-radius: 8px;
1359
- overflow: hidden;
1360
- }
1361
- .progress-container {
1362
- margin-top: 15px;
1363
- margin-bottom: 15px;
1364
- }
1365
- .error-message {
1366
- background-color: #FFF5F5;
1367
- border: 1px solid #FED7D7;
1368
- color: #E53E3E;
1369
- padding: 10px;
1370
- border-radius: 4px;
1371
- margin-top: 10px;
1372
- font-weight: 500;
1373
- }
1374
- @media (max-width: 768px) {
1375
- #app-container {
1376
- padding: 0.5rem;
1377
- }
1378
- .mobile-full-width {
1379
- flex-direction: column !important;
1380
- }
1381
- .mobile-full-width > .gr-block {
1382
- width: 100% !important;
1383
- }
1384
- }
1385
- """
1386
- return base_progress_css + pastel_css
1387
-
1388
- css = make_custom_css()
1389
-
1390
- # Sample prompts
1391
  quick_prompts = [
1392
- ["The girl dances gracefully, with clear movements, full of charm."],
1393
- ["A character doing some simple body movements."]
1394
  ]
 
1395
 
1396
- # Gradio UI
 
1397
  block = gr.Blocks(css=css).queue()
1398
  with block:
1399
- gr.HTML("<div id='app-container'><h1>FramePack - Image to Video Generation</h1></div>")
1400
-
1401
- with gr.Row(elem_classes="mobile-full-width"):
1402
- # Left column
1403
- with gr.Column(scale=1, elem_classes="gr-panel"):
1404
- input_image = gr.Image(
1405
- label=get_translation("upload_image"),
1406
- type="numpy",
1407
- height=320
1408
- )
1409
- prompt = gr.Textbox(
1410
- label=get_translation("prompt"),
1411
- value=''
1412
- )
1413
-
1414
- example_quick_prompts = gr.Dataset(
1415
- samples=quick_prompts,
1416
- label=get_translation("quick_prompts"),
1417
- samples_per_page=1000,
1418
- components=[prompt]
1419
- )
1420
- example_quick_prompts.click(
1421
- fn=lambda x: x[0],
1422
- inputs=[example_quick_prompts],
1423
- outputs=prompt,
1424
- show_progress=False,
1425
- queue=False
1426
- )
1427
-
1428
- # Right column
1429
- with gr.Column(scale=1, elem_classes="gr-panel"):
1430
- with gr.Row(elem_classes="button-container"):
1431
- start_button = gr.Button(
1432
- value=get_translation("start_generation"),
1433
- elem_id="start-button",
1434
- variant="primary"
1435
- )
1436
- stop_button = gr.Button(
1437
- value=get_translation("stop_generation"),
1438
- elem_id="stop-button",
1439
- interactive=False
1440
- )
1441
-
1442
- result_video = gr.Video(
1443
- label=get_translation("generated_video"),
1444
- autoplay=True,
1445
- loop=True,
1446
- height=320,
1447
- elem_classes="video-container"
1448
- )
1449
- preview_image = gr.Image(
1450
- label=get_translation("next_latents"),
1451
- visible=False,
1452
- height=150,
1453
- elem_classes="preview-container"
1454
- )
1455
- gr.Markdown(get_translation("sampling_note"))
1456
-
1457
- with gr.Group(elem_classes="progress-container"):
1458
- progress_desc = gr.Markdown('')
1459
- progress_bar = gr.HTML('')
1460
-
1461
- error_message = gr.HTML('', visible=True)
1462
-
1463
- # Advanced
1464
- with gr.Accordion("Advanced Settings", open=False, elem_classes="gr-panel"):
1465
- use_teacache = gr.Checkbox(
1466
- label=get_translation("use_teacache"),
1467
- value=True,
1468
- info=get_translation("teacache_info")
1469
- )
1470
- n_prompt = gr.Textbox(label=get_translation("negative_prompt"), value="", visible=False)
1471
- seed = gr.Number(
1472
- label=get_translation("seed"),
1473
- value=31337,
1474
- precision=0
1475
- )
1476
- # Default 2 seconds, max 4 seconds
1477
- total_second_length = gr.Slider(
1478
- label=get_translation("video_length"),
1479
- minimum=1,
1480
- maximum=4,
1481
- value=2,
1482
- step=0.1
1483
- )
1484
- latent_window_size = gr.Slider(
1485
- label=get_translation("latent_window"),
1486
- minimum=1,
1487
- maximum=33,
1488
- value=9,
1489
- step=1,
1490
- visible=False
1491
- )
1492
- steps = gr.Slider(
1493
- label=get_translation("steps"),
1494
- minimum=1,
1495
- maximum=100,
1496
- value=25,
1497
- step=1,
1498
- info=get_translation("steps_info")
1499
- )
1500
- cfg = gr.Slider(
1501
- label=get_translation("cfg_scale"),
1502
- minimum=1.0,
1503
- maximum=32.0,
1504
- value=1.0,
1505
- step=0.01,
1506
- visible=False
1507
- )
1508
- gs = gr.Slider(
1509
- label=get_translation("distilled_cfg"),
1510
- minimum=1.0,
1511
- maximum=32.0,
1512
- value=10.0,
1513
- step=0.01,
1514
- info=get_translation("distilled_cfg_info")
1515
- )
1516
- rs = gr.Slider(
1517
- label=get_translation("cfg_rescale"),
1518
- minimum=0.0,
1519
- maximum=1.0,
1520
- value=0.0,
1521
- step=0.01,
1522
- visible=False
1523
- )
1524
- gpu_memory_preservation = gr.Slider(
1525
- label=get_translation("gpu_memory"),
1526
- minimum=6,
1527
- maximum=128,
1528
- value=6,
1529
- step=0.1,
1530
- info=get_translation("gpu_memory_info")
1531
- )
1532
-
1533
- # Button wiring
1534
- inputs_list = [
1535
- input_image, prompt, n_prompt, seed,
1536
- total_second_length, latent_window_size, steps,
1537
- cfg, gs, rs, gpu_memory_preservation, use_teacache
1538
- ]
1539
- start_button.click(
1540
- fn=process,
1541
- inputs=inputs_list,
1542
- outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, stop_button]
1543
- )
1544
- stop_button.click(fn=end_process)
1545
-
1546
- block.launch()
 
 
 
 
 
1
 
2
  import os
3
 
4
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
 
 
5
 
6
  import gradio as gr
7
  import torch
 
10
  import safetensors.torch as sf
11
  import numpy as np
12
  import math
13
+ import spaces
14
+
15
+ from PIL import Image
16
+ from diffusers import AutoencoderKLHunyuanVideo
17
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
18
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
19
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
20
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
21
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
22
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  from diffusers_helper.thread_utils import AsyncStream, async_run
24
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
25
+ from transformers import SiglipImageProcessor, SiglipVisionModel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  from diffusers_helper.clip_vision import hf_clip_vision_encode
27
+ from diffusers_helper.bucket_tools import find_nearest_bucket
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
29
 
30
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
31
+ high_vram = free_mem_gb > 60
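+ # Only GPUs with more than 60 GB of free VRAM count as high-VRAM mode; otherwise the models are offloaded with DynamicSwap below.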
 
 
 
 
32
 
+ print(f'Free VRAM {free_mem_gb} GB')
+ print(f'High-VRAM Mode: {high_vram}')
+
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
+
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
+
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
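+ # All weights start on the CPU: the HunyuanVideo text encoders, tokenizers and VAE come from
+ # hunyuanvideo-community/HunyuanVideo (fp16), the SigLIP image encoder from lllyasviel/flux_redux_bfl,
+ # and the FramePack-F1 packed video transformer is loaded in bf16.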
 
+ vae.eval()
+ text_encoder.eval()
+ text_encoder_2.eval()
+ image_encoder.eval()
+ transformer.eval()
+
+ if not high_vram:
+     vae.enable_slicing()
+     vae.enable_tiling()
+
+ transformer.high_quality_fp32_output_for_inference = True
+ print('transformer.high_quality_fp32_output_for_inference = True')
+
+ transformer.to(dtype=torch.bfloat16)
+ vae.to(dtype=torch.float16)
+ image_encoder.to(dtype=torch.float16)
+ text_encoder.to(dtype=torch.float16)
+ text_encoder_2.to(dtype=torch.float16)
+
+ vae.requires_grad_(False)
+ text_encoder.requires_grad_(False)
+ text_encoder_2.requires_grad_(False)
+ image_encoder.requires_grad_(False)
+ transformer.requires_grad_(False)
+
+ if not high_vram:
+     # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
+     DynamicSwapInstaller.install_model(transformer, device=gpu)
+     DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+ else:
+     text_encoder.to(gpu)
+     text_encoder_2.to(gpu)
+     image_encoder.to(gpu)
+     vae.to(gpu)
+     transformer.to(gpu)
+
+ stream = AsyncStream()
+
+ outputs_folder = './outputs/'
+ os.makedirs(outputs_folder, exist_ok=True)
+
+ examples = [
+     ["img_examples/1.png", "The girl dances gracefully, with clear movements, full of charm."],
+     ["img_examples/2.jpg", "The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair."],
+     ["img_examples/3.png", "The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements."],
+ ]

+ def generate_examples(input_image, prompt):
+     t2v = False
+     n_prompt = ""
+     seed = 31337
+     total_second_length = 5
+     latent_window_size = 9
+     steps = 25
+     cfg = 1.0
+     gs = 10.0
+     rs = 0.0
+     gpu_memory_preservation = 6
+     use_teacache = True
+     mp4_crf = 16

      global stream
+
+     # assert input_image is not None, 'No input image!'
+     if t2v:
+         default_height, default_width = 640, 640
+         input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
+         print("No input image provided. Using a blank white image.")
+
+     yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

      stream = AsyncStream()
+
+     async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)

      output_filename = None

      while True:
          flag, data = stream.output_queue.next()
+
          if flag == 'file':
              output_filename = data
+             yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

+         if flag == 'progress':
              preview, desc, html = data
              yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)

+         if flag == 'end':
+             yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
              break

+
  @torch.no_grad()
+ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
      total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
      total_latent_sections = int(max(round(total_latent_sections), 1))
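      # Each section of latent_window_size latents decodes to latent_window_size * 4 - 3 pixel frames
      # at 30 fps, so e.g. 5 s of video -> 150 frames -> 150 / (9 * 4) ~= 4 sections with the default window of 9.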

      stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

      try:
+         # Clean GPU
+         if not high_vram:
              unload_complete_models(
                  text_encoder, text_encoder_2, image_encoder, vae, transformer
              )

          # Text encoding
+
          stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

+         if not high_vram:
+             fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
              load_model_as_complete(text_encoder_2, target_device=gpu)

          llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+
+         if cfg == 1:
              llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
          else:
              llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

+         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
+         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
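+         # Both positive and negative prompt embeddings are cropped/padded to a fixed 512 tokens,
+         # and the returned attention masks mark real tokens versus padding.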
+
+         # Processing input image

          stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

          H, W, C = input_image.shape
          height, width = find_nearest_bucket(H, W, resolution=640)
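          # find_nearest_bucket snaps the input to the closest supported resolution bucket around a
          # 640-pixel budget, roughly preserving aspect ratio while keeping VAE-friendly dimensions.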
          input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

          Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

          input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
          input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
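          # Pixels go from [0, 255] to [-1, 1] and are reshaped to (batch, channels, time, height, width)
          # with a single time step, the layout expected by the video VAE.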

+         # VAE encoding
+
          stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

+         if not high_vram:
              load_model_as_complete(vae, target_device=gpu)
+
          start_latent = vae_encode(input_image_pt, vae)

          # CLIP Vision
+
          stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

+         if not high_vram:
              load_model_as_complete(image_encoder, target_device=gpu)
+
          image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
          image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

+         # Dtype
+
          llama_vec = llama_vec.to(transformer.dtype)
          llama_vec_n = llama_vec_n.to(transformer.dtype)
          clip_l_pooler = clip_l_pooler.to(transformer.dtype)
          clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
          image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

+         # Sampling
+
          stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

          rnd = torch.Generator("cpu").manual_seed(seed)

          history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
          history_pixels = None

          history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
          total_generated_latent_frames = 1
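          # history_latents keeps a rolling temporal context of 16 + 2 + 1 latent frames (the 4x, 2x
          # and 1x conditioning slots used below); the encoded start image is appended as the first
          # real latent frame.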

                  stream.output_queue.push(('end', None))
                  return

+             print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')

+             if not high_vram:
                  unload_complete_models()
                  move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

              if use_teacache:
                  transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
              else:

              def callback(d):
                  preview = d['denoised']
                  preview = vae_decode_fake(preview)
+
                  preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
                  preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

                  if stream.input_queue.top() == 'end':
                      stream.output_queue.push(('end', None))
+                     raise KeyboardInterrupt('User ends the task.')

                  current_step = d['i'] + 1
                  percentage = int(100.0 * current_step / steps)
                  hint = f'Sampling {current_step}/{steps}'
+                 desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
                  stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
                  return

              indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
+             clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
              clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)

+             clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
              clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
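              # Conditioning layout per section: 1 index for the start image, 16 positions of 4x-compressed
              # history, 2 at 2x, 1 full ("1x") previous latent, then latent_window_size positions that are
              # actually denoised in this step.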

+             generated_latents = sample_hunyuan(
+                 transformer=transformer,
+                 sampler='unipc',
+                 width=width,
+                 height=height,
+                 frames=latent_window_size * 4 - 3,
+                 real_guidance_scale=cfg,
+                 distilled_guidance_scale=gs,
+                 guidance_rescale=rs,
+                 # shift=3.0,
+                 num_inference_steps=steps,
+                 generator=rnd,
+                 prompt_embeds=llama_vec,
+                 prompt_embeds_mask=llama_attention_mask,
+                 prompt_poolers=clip_l_pooler,
+                 negative_prompt_embeds=llama_vec_n,
+                 negative_prompt_embeds_mask=llama_attention_mask_n,
+                 negative_prompt_poolers=clip_l_pooler_n,
+                 device=gpu,
+                 dtype=torch.bfloat16,
+                 image_embeddings=image_encoder_last_hidden_state,
+                 latent_indices=latent_indices,
+                 clean_latents=clean_latents,
+                 clean_latent_indices=clean_latent_indices,
+                 clean_latents_2x=clean_latents_2x,
+                 clean_latent_2x_indices=clean_latent_2x_indices,
+                 clean_latents_4x=clean_latents_4x,
+                 clean_latent_4x_indices=clean_latent_4x_indices,
+                 callback=callback,
+             )
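+             # With the default latent_window_size of 9 this asks for 9 * 4 - 3 = 33 frames' worth of
+             # latents per call; guidance comes mainly from the distilled CFG scale (gs) since cfg is 1 by default.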
 
+             total_generated_latent_frames += int(generated_latents.shape[2])
              history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

+             if not high_vram:
                  offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
                  load_model_as_complete(vae, target_device=gpu)

                  history_pixels = vae_decode(real_history_latents, vae).cpu()
              else:
                  section_latent_frames = latent_window_size * 2
+                 overlapped_frames = latent_window_size * 4 - 3
+
                  current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                  history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
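+                 # Each new section is decoded with latent_window_size * 4 - 3 frames of overlap and
+                 # soft_append_bcthw blends that overlap so section boundaries do not pop.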

+             if not high_vram:
                  unload_complete_models()

              output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

+             save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)

+             print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
+
+             stream.output_queue.push(('file', output_filename))
      except:
          traceback.print_exc()
+
+         if not high_vram:
+             unload_complete_models(
+                 text_encoder, text_encoder_2, image_encoder, vae, transformer
+             )

      stream.output_queue.push(('end', None))
      return

+ def get_duration(input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
+     return total_second_length * 60
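+ # get_duration is passed to @spaces.GPU so the ZeroGPU scheduler reserves GPU time per request:
+ # a budget of 60 seconds of compute per requested second of video.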

+ @spaces.GPU(duration=get_duration)
+ def process(input_image, prompt,
+             t2v=False,
+             n_prompt="",
+             seed=31337,
+             total_second_length=5,
+             latent_window_size=9,
+             steps=25,
+             cfg=1.0,
+             gs=10.0,
+             rs=0.0,
+             gpu_memory_preservation=6,
+             use_teacache=True,
+             mp4_crf=16
+             ):
      global stream
+
+     # assert input_image is not None, 'No input image!'
+     if t2v:
+         default_height, default_width = 640, 640
+         input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
+         print("No input image provided. Using a blank white image.")
+     else:
+         composite_rgba_uint8 = input_image["composite"]
+
+         # rgb_uint8 will be (H, W, 3), dtype uint8
+         rgb_uint8 = composite_rgba_uint8[:, :, :3]
+         # mask_uint8 will be (H, W), dtype uint8
+         mask_uint8 = composite_rgba_uint8[:, :, 3]
+
+         # Create background
+         h, w = rgb_uint8.shape[:2]
+         # White background, (H, W, 3), dtype uint8
+         background_uint8 = np.full((h, w, 3), 255, dtype=np.uint8)
+
+         # Normalize mask to range [0.0, 1.0].
+         alpha_normalized_float32 = mask_uint8.astype(np.float32) / 255.0
+
+         # Expand alpha to 3 channels to match RGB images for broadcasting.
+         # alpha_mask_float32 will have shape (H, W, 3)
+         alpha_mask_float32 = np.stack([alpha_normalized_float32] * 3, axis=2)
+
+         # alpha blending
+         blended_image_float32 = rgb_uint8.astype(np.float32) * alpha_mask_float32 + \
+                                 background_uint8.astype(np.float32) * (1.0 - alpha_mask_float32)
+
+         input_image = np.clip(blended_image_float32, 0, 255).astype(np.uint8)
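+         # Net effect: the RGBA canvas from the ImageEditor is composited over a plain white background,
+         # so transparent areas become white while opaque pixels (the base image plus any brush strokes) keep their colour.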

+     yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)

      stream = AsyncStream()
+
+     async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf)

      output_filename = None

      while True:
          flag, data = stream.output_queue.next()
+
          if flag == 'file':
              output_filename = data
+             yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)

+         if flag == 'progress':
              preview, desc, html = data
              yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)

+         if flag == 'end':
+             yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
              break

+
+ def end_process():
+     stream.input_queue.push('end')
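+     # The 'end' signal is read inside the sampling callback, which raises KeyboardInterrupt to stop
+     # the running job at the next denoising step.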


  quick_prompts = [
+     'The girl dances gracefully, with clear movements, full of charm.',
+     'A character doing some simple body movements.',
  ]
+ quick_prompts = [[x] for x in quick_prompts]
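+ # gr.Dataset expects one row per sample, each row holding one value per bound component, hence the
+ # single-element list wrapped around every prompt string.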

+
+ css = make_progress_bar_css()
  block = gr.Blocks(css=css).queue()
  with block:
+     gr.Markdown('# FramePack-F1')
+     gr.Markdown(f"""### Video diffusion, but feels like image diffusion
+ *FramePack F1 - a FramePack model that only predicts future frames from history frames*
+ ### *beta* FramePack Fill 🖋️ - draw a mask over the input image to inpaint the video output
+ adapted from the official code repo [FramePack](https://github.com/lllyasviel/FramePack) by [lllyasviel](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503) and [FramePack Studio](https://github.com/colinurbs/FramePack-Studio) 🙌🏻
+ """)
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.ImageEditor(type="numpy", label="Image", height=320, brush=gr.Brush(colors=["#ffffff"]))
+             prompt = gr.Textbox(label="Prompt", value='')
+             t2v = gr.Checkbox(label="do text-to-video", value=False)
+             example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
+             example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
+
+             with gr.Row():
+                 start_button = gr.Button(value="Start Generation")
+                 end_button = gr.Button(value="End Generation", interactive=False)
+
+             total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=5, value=2, step=0.1)
+             with gr.Group():
+                 with gr.Accordion("Advanced settings", open=False):
+                     use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')
+
+                     n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)  # Not used
+                     seed = gr.Number(label="Seed", value=31337, precision=0)
+
+                     latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False)  # Should not change
+                     steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')
+
+                     cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False)  # Should not change
+                     gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
+                     rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)  # Should not change
+
+                     gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
+
+                     mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs.")
+
+         with gr.Column():
+             preview_image = gr.Image(label="Next Latents", height=200, visible=False)
+             result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
+             progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
+             progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+
+     gr.HTML('<div style="text-align:center; margin-top:20px;">Share your results and find ideas at the <a href="https://x.com/search?q=framepack&f=live" target="_blank">FramePack Twitter (X) thread</a></div>')
+
+     ips = [input_image, prompt, t2v, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
+     start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+     end_button.click(fn=end_process)
+
+     # gr.Examples(
+     #     examples,
+     #     inputs=[input_image, prompt],
+     #     outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
+     #     fn=generate_examples,
+     #     cache_examples=True
+     # )
+
+
+ block.launch(share=True)