ginipick committed on
Commit 4418d0f · verified · 1 Parent(s): 1da4d19

Update app.py

Files changed (1)
  1. app.py +1178 -916
app.py CHANGED
@@ -1,21 +1,800 @@
1
- ########################################
2
  # from diffusers_helper.hf_login import login
3
- # Use the login function if needed (uncomment the import above)
4
- ########################################
5
 
6
  import os
7
- import threading
8
  import time
9
- import requests
10
- from requests.adapters import HTTPAdapter
11
- from urllib3.util.retry import Retry
12
- import json
13
 
14
  os.environ['HF_HOME'] = os.path.abspath(
15
  os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
16
  )
17
 
18
- # Translation dictionary for single-language (English-only) use
19
  translations = {
20
  "en": {
21
  "title": "FramePack - Image to Video Generation",
@@ -28,6 +807,7 @@ translations = {
28
  "teacache_info": "Faster speed, but may result in slightly worse finger and hand generation.",
29
  "negative_prompt": "Negative Prompt",
30
  "seed": "Seed",
 
31
  "video_length": "Video Length (max 4 seconds)",
32
  "latent_window": "Latent Window Size",
33
  "steps": "Inference Steps",
@@ -40,7 +820,7 @@ translations = {
40
  "gpu_memory_info": "Set this to a larger value if you encounter OOM errors. Larger values cause slower speed.",
41
  "next_latents": "Next Latents",
42
  "generated_video": "Generated Video",
43
- "sampling_note": "Note: Due to reversed sampling, ending actions will be generated before starting actions. If the starting action is not in the video, please wait, it will be generated later.",
44
  "error_message": "Error",
45
  "processing_error": "Processing error",
46
  "network_error": "Network connection is unstable, model download timed out. Please try again later.",
@@ -51,75 +831,14 @@ translations = {
51
  }
52
  }
53
 
54
- # Only English is used, so this function effectively always returns English.
55
  def get_translation(key):
56
  return translations["en"].get(key, key)
57
 
58
- # Language is fixed to English
59
- current_language = "en"
60
-
61
- import gradio as gr
62
- import torch
63
- import traceback
64
- import einops
65
- import safetensors.torch as sf
66
- import numpy as np
67
- import math
68
-
69
- # Check for the Hugging Face Space environment
70
- IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
71
-
72
- # Global flags for GPU availability
73
- GPU_AVAILABLE = False
74
- GPU_INITIALIZED = False
75
- last_update_time = time.time()
76
-
77
- if IN_HF_SPACE:
78
- try:
79
- import spaces
80
- print("Running in Hugging Face Space environment.")
81
- try:
82
- GPU_AVAILABLE = torch.cuda.is_available()
83
- print(f"GPU available: {GPU_AVAILABLE}")
84
- if GPU_AVAILABLE:
85
- test_tensor = torch.zeros(1, device='cuda') + 1
86
- del test_tensor
87
- print("GPU small test pass")
88
- except Exception as e:
89
- GPU_AVAILABLE = False
90
- print(f"Error checking GPU: {e}")
91
- except ImportError:
92
- GPU_AVAILABLE = torch.cuda.is_available()
93
-
94
- from PIL import Image
95
- from diffusers import AutoencoderKLHunyuanVideo
96
- from transformers import (
97
- LlamaModel,
98
- CLIPTextModel,
99
- LlamaTokenizerFast,
100
- CLIPTokenizer,
101
- SiglipImageProcessor,
102
- SiglipVisionModel
103
- )
104
-
105
- from diffusers_helper.hunyuan import (
106
- encode_prompt_conds,
107
- vae_decode,
108
- vae_encode,
109
- vae_decode_fake
110
- )
111
-
112
- from diffusers_helper.utils import (
113
- save_bcthw_as_mp4,
114
- crop_or_pad_yield_mask,
115
- soft_append_bcthw,
116
- resize_and_center_crop,
117
- generate_timestamp
118
- )
119
-
120
- from diffusers_helper.bucket_tools import find_nearest_bucket
121
- from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
122
- from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
123
  from diffusers_helper.memory import (
124
  cpu,
125
  gpu,
@@ -131,644 +850,326 @@ from diffusers_helper.memory import (
131
  unload_complete_models,
132
  load_model_as_complete
133
  )
134
-
135
- from diffusers_helper.thread_utils import AsyncStream, async_run
136
- from diffusers_helper.clip_vision import hf_clip_vision_encode
137
- from diffusers_helper.gradio.progress_bar import (
138
- make_progress_bar_css,
139
- make_progress_bar_html
140
  )
141
 
142
- outputs_folder = './outputs/'
143
- os.makedirs(outputs_folder, exist_ok=True)
144
 
145
- # Check GPU memory
146
- if not IN_HF_SPACE:
147
- try:
148
- if torch.cuda.is_available():
149
- free_mem_gb = get_cuda_free_memory_gb(gpu)
150
- print(f'Free VRAM: {free_mem_gb} GB')
151
- else:
152
- free_mem_gb = 6.0
153
- print("CUDA not available, default memory setting used.")
154
- except Exception as e:
155
- free_mem_gb = 6.0
156
- print(f"Error getting GPU mem: {e}, using default=6GB")
157
- high_vram = free_mem_gb > 60
158
- else:
159
- print("Using default memory setting in Spaces environment.")
160
  try:
161
- if GPU_AVAILABLE:
162
- free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 * 0.9
163
- high_vram = (free_mem_gb > 10)
164
- else:
165
- free_mem_gb = 6.0
166
- high_vram = False
167
- except Exception as e:
168
- free_mem_gb = 6.0
169
- high_vram = False
170
- print(f'GPU memory: {free_mem_gb:.2f} GB, High-VRAM mode: {high_vram}')
171
-
172
- models = {}
173
  cpu_fallback_mode = not GPU_AVAILABLE
 
174
 
175
- def load_models():
176
- """
177
- Load or initialize the global models
178
- """
179
- global models, cpu_fallback_mode, GPU_INITIALIZED
180
-
181
- if GPU_INITIALIZED:
182
- print("Models are already loaded, skipping re-initialization.")
183
- return models
184
 
185
- print("Start loading models...")
 
186
 
187
- try:
188
- device = 'cuda' if GPU_AVAILABLE and not cpu_fallback_mode else 'cpu'
189
- model_device = 'cpu'
190
-
191
- dtype = torch.float16 if GPU_AVAILABLE else torch.float32
192
- transformer_dtype = torch.bfloat16 if GPU_AVAILABLE else torch.float32
193
-
194
- print(f"Device: {device}, VAE/Encoders dtype={dtype}, Transformer dtype={transformer_dtype}")
195
-
196
- try:
197
- # (1) ํ…์ŠคํŠธ ์ธ์ฝ”๋”
198
- text_encoder = LlamaModel.from_pretrained(
199
- "hunyuanvideo-community/HunyuanVideo",
200
- subfolder='text_encoder',
201
- torch_dtype=dtype
202
- ).to(model_device)
203
-
204
- text_encoder_2 = CLIPTextModel.from_pretrained(
205
- "hunyuanvideo-community/HunyuanVideo",
206
- subfolder='text_encoder_2',
207
- torch_dtype=dtype
208
- ).to(model_device)
209
-
210
- tokenizer = LlamaTokenizerFast.from_pretrained(
211
- "hunyuanvideo-community/HunyuanVideo",
212
- subfolder='tokenizer'
213
- )
214
- tokenizer_2 = CLIPTokenizer.from_pretrained(
215
- "hunyuanvideo-community/HunyuanVideo",
216
- subfolder='tokenizer_2'
217
- )
218
 
219
- # (2) VAE
220
- vae = AutoencoderKLHunyuanVideo.from_pretrained(
221
- "hunyuanvideo-community/HunyuanVideo",
222
- subfolder='vae',
223
- torch_dtype=dtype
224
- ).to(model_device)
225
 
226
- # (3) CLIP Vision
227
- feature_extractor = SiglipImageProcessor.from_pretrained(
228
- "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
229
- )
230
- image_encoder = SiglipVisionModel.from_pretrained(
231
- "lllyasviel/flux_redux_bfl",
232
- subfolder='image_encoder',
233
- torch_dtype=dtype
234
- ).to(model_device)
235
-
236
- # (4) Transformer (FramePack_F1)
237
- #
238
- # Before: "lllyasviel/FramePackI2V_HY"
239
- # After: "lllyasviel/FramePack_F1_I2V_HY_20250503" (as given in the second code sample)
240
- #
241
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
242
- "lllyasviel/FramePack_F1_I2V_HY_20250503",
243
- torch_dtype=transformer_dtype
244
- ).to(model_device)
245
-
246
- print("All models loaded successfully.")
247
- except Exception as e:
248
- print(f"Error loading models: {e}")
249
- print("Retry with float32 on CPU...")
250
- dtype = torch.float32
251
- transformer_dtype = torch.float32
252
- cpu_fallback_mode = True
253
-
254
- text_encoder = LlamaModel.from_pretrained(
255
- "hunyuanvideo-community/HunyuanVideo",
256
- subfolder='text_encoder',
257
- torch_dtype=dtype
258
- ).to('cpu')
259
- text_encoder_2 = CLIPTextModel.from_pretrained(
260
- "hunyuanvideo-community/HunyuanVideo",
261
- subfolder='text_encoder_2',
262
- torch_dtype=dtype
263
- ).to('cpu')
264
- tokenizer = LlamaTokenizerFast.from_pretrained(
265
- "hunyuanvideo-community/HunyuanVideo",
266
- subfolder='tokenizer'
267
- )
268
- tokenizer_2 = CLIPTokenizer.from_pretrained(
269
- "hunyuanvideo-community/HunyuanVideo",
270
- subfolder='tokenizer_2'
271
- )
272
- vae = AutoencoderKLHunyuanVideo.from_pretrained(
273
- "hunyuanvideo-community/HunyuanVideo",
274
- subfolder='vae',
275
- torch_dtype=dtype
276
- ).to('cpu')
277
-
278
- feature_extractor = SiglipImageProcessor.from_pretrained(
279
- "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
280
- )
281
- image_encoder = SiglipVisionModel.from_pretrained(
282
- "lllyasviel/flux_redux_bfl",
283
- subfolder='image_encoder',
284
- torch_dtype=dtype
285
- ).to('cpu')
286
-
287
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
288
- "lllyasviel/FramePack_F1_I2V_HY_20250503",
289
- torch_dtype=transformer_dtype
290
- ).to('cpu')
291
-
292
- print("Loaded in CPU-only fallback mode.")
293
-
294
- vae.eval()
295
- text_encoder.eval()
296
- text_encoder_2.eval()
297
- image_encoder.eval()
298
- transformer.eval()
299
-
300
- if not high_vram or cpu_fallback_mode:
301
- vae.enable_slicing()
302
- vae.enable_tiling()
303
-
304
- # Needed for the FramePack_F1 model
305
- transformer.high_quality_fp32_output_for_inference = True
306
- print("transformer.high_quality_fp32_output_for_inference = True")
307
-
308
- if not cpu_fallback_mode:
309
- transformer.to(dtype=transformer_dtype)
310
- vae.to(dtype=dtype)
311
- image_encoder.to(dtype=dtype)
312
- text_encoder.to(dtype=dtype)
313
- text_encoder_2.to(dtype=dtype)
314
-
315
- vae.requires_grad_(False)
316
- text_encoder.requires_grad_(False)
317
- text_encoder_2.requires_grad_(False)
318
- image_encoder.requires_grad_(False)
319
- transformer.requires_grad_(False)
320
-
321
- if torch.cuda.is_available() and not cpu_fallback_mode:
322
- try:
323
- if not high_vram:
324
- # If VRAM is low, swap models between GPU and CPU on demand via DynamicSwapInstaller
325
- DynamicSwapInstaller.install_model(transformer, device=device)
326
- DynamicSwapInstaller.install_model(text_encoder, device=device)
327
- else:
328
- text_encoder.to(device)
329
- text_encoder_2.to(device)
330
- image_encoder.to(device)
331
- vae.to(device)
332
- transformer.to(device)
333
- print(f"Moved models to {device}")
334
- except Exception as e:
335
- print(f"Error moving models to {device}: {e}, fallback to CPU")
336
- cpu_fallback_mode = True
337
-
338
- models_local = {
339
- 'text_encoder': text_encoder,
340
- 'text_encoder_2': text_encoder_2,
341
- 'tokenizer': tokenizer,
342
- 'tokenizer_2': tokenizer_2,
343
- 'vae': vae,
344
- 'feature_extractor': feature_extractor,
345
- 'image_encoder': image_encoder,
346
- 'transformer': transformer
347
- }
348
 
349
- GPU_INITIALIZED = True
350
- models.update(models_local)
351
- print(f"Model load complete. Running mode: {'CPU' if cpu_fallback_mode else 'GPU'}")
352
- return models
353
- except Exception as e:
354
- print(f"Unexpected error in load_models(): {e}")
355
- traceback.print_exc()
356
- cpu_fallback_mode = True
357
- return {}
358
 
359
- # GPU decorator (Spaces only)
360
- if IN_HF_SPACE and 'spaces' in globals() and GPU_AVAILABLE:
361
- try:
362
- @spaces.GPU
363
- def initialize_models():
364
- global GPU_INITIALIZED
365
- try:
366
- result = load_models()
367
- GPU_INITIALIZED = True
368
- return result
369
- except Exception as e:
370
- print(f"Error in @spaces.GPU model init: {e}")
371
- global cpu_fallback_mode
372
- cpu_fallback_mode = True
373
- return load_models()
374
- except Exception as e:
375
- print(f"Error creating spaces.GPU decorator: {e}")
376
- def initialize_models():
377
- return load_models()
378
- else:
379
- def initialize_models():
380
- return load_models()
381
-
382
- def get_models():
383
- """
384
- Retrieve or load models if not loaded yet.
385
- """
386
- global models
387
- model_loading_key = "__model_loading__"
388
-
389
- if not models:
390
- if model_loading_key in globals():
391
- print("Models are loading, please wait...")
392
- import time
393
- start_wait = time.time()
394
- while (not models) and (model_loading_key in globals()):
395
- time.sleep(0.5)
396
- if time.time() - start_wait > 60:
397
- print("Timed out waiting for model load.")
398
- break
399
- if models:
400
- return models
401
- try:
402
- globals()[model_loading_key] = True
403
- if IN_HF_SPACE and 'spaces' in globals() and GPU_AVAILABLE and not cpu_fallback_mode:
404
- try:
405
- print("Loading models via @spaces.GPU decorator.")
406
- models_local = initialize_models()
407
- models.update(models_local)
408
- except Exception as e:
409
- print(f"Error with GPU decorator: {e}, direct load fallback.")
410
- models_local = load_models()
411
- models.update(models_local)
412
- else:
413
- models_local = load_models()
414
- models.update(models_local)
415
- except Exception as e:
416
- print(f"Unexpected error while loading models: {e}")
417
- models.clear()
418
- finally:
419
- if model_loading_key in globals():
420
- del globals()[model_loading_key]
421
- return models
422
 
423
- stream = AsyncStream()
424
 
425
- def create_error_html(error_msg, is_timeout=False):
426
- """
427
- Create a user-friendly error message in English only
428
- """
429
- if is_timeout:
430
- if "partial" in error_msg:
431
- en_msg = "Processing timed out, but partial video has been generated."
432
- else:
433
- en_msg = f"Processing timed out: {error_msg}"
434
- elif "model load" in error_msg.lower():
435
- en_msg = "Failed to load models. Possibly heavy traffic or GPU issues."
436
- elif "gpu" in error_msg.lower() or "cuda" in error_msg.lower() or "memory" in error_msg.lower():
437
- en_msg = "GPU memory insufficient or error. Please try increasing GPU memory or reduce video length."
438
- elif "sampling" in error_msg.lower():
439
- if "partial" in error_msg.lower():
440
- en_msg = "Error during sampling process, but partial video has been generated."
441
  else:
442
- en_msg = "Error during sampling process. Unable to generate video."
443
- elif "timeout" in error_msg.lower():
444
- en_msg = "Network or model download timed out. Please try again later."
445
  else:
446
- en_msg = f"Processing error: {error_msg}"
447
-
448
- return f"""
449
- <div class="error-message" id="custom-error-container">
450
- <div>
451
- <span class="error-icon">⚠️</span> {en_msg}
452
- </div>
453
- </div>
454
- <script>
455
- // Hide default Gradio error UI
456
- (function() {{
457
- const defaultErrorElements = document.querySelectorAll('.error');
458
- defaultErrorElements.forEach(el => {{
459
- el.style.display = 'none';
460
- }});
461
- }})();
462
- </script>
463
- """
464
 
465
  @torch.no_grad()
466
  def worker(
467
- input_image,
468
- prompt,
469
- n_prompt,
470
- seed,
471
- total_second_length,
472
- latent_window_size,
473
- steps,
474
- cfg,
475
- gs,
476
- rs,
477
- gpu_memory_preservation,
478
- use_teacache
479
  ):
480
  """
481
- Final video generation logic (runs in the background)
482
  """
 
 
 
483
  global last_update_time
484
- last_update_time = time.time()
485
 
486
- # Default 2 seconds, capped at 4 seconds
487
  total_second_length = min(total_second_length, 4.0)
488
 
489
- try:
490
- models_local = get_models()
491
- if not models_local:
492
- error_msg = "Model load failed. Check logs for details."
493
- print(error_msg)
494
- stream.output_queue.push(('error', error_msg))
495
- stream.output_queue.push(('end', None))
496
- return
497
-
498
- text_encoder = models_local['text_encoder']
499
- text_encoder_2 = models_local['text_encoder_2']
500
- tokenizer = models_local['tokenizer']
501
- tokenizer_2 = models_local['tokenizer_2']
502
- vae = models_local['vae']
503
- feature_extractor = models_local['feature_extractor']
504
- image_encoder = models_local['image_encoder']
505
- transformer = models_local['transformer']
506
- except Exception as e:
507
- err = f"Error retrieving models: {e}"
508
- print(err)
509
- traceback.print_exc()
510
- stream.output_queue.push(('error', err))
511
- stream.output_queue.push(('end', None))
512
- return
513
-
514
- device = 'cuda' if (GPU_AVAILABLE and not cpu_fallback_mode) else 'cpu'
515
- print(f"Inference device: {device}")
516
-
517
- # To cover total_second_length at 30 fps, several chunks of latent_window_size*4-3 frames must be chained.
518
- # The repetition count is simply (total seconds * fps) / (latent_window_size*4-3)
519
- # Implemented by iterating over sections, as in the second example code
520
-
521
- # Based on the 'FramePack_F1' model, sampling extends the video "a little at a time" as below
522
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
523
  total_latent_sections = int(max(round(total_latent_sections), 1))
524
 
525
  job_id = generate_timestamp()
526
- last_output_filename = None
527
- history_latents = None
528
- history_pixels = None
529
- total_generated_latent_frames = 0
530
 
531
- # Initial message
532
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
533
 
534
  try:
535
- # If VRAM is low, unload models up front
536
- if not high_vram and not cpu_fallback_mode:
537
- try:
538
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
539
- except Exception as e:
540
- print(f"Error unloading models: {e}")
541
 
542
- # (1) Text Encode
543
- last_update_time = time.time()
544
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding...'))))
545
 
546
- try:
547
- if not high_vram and not cpu_fallback_mode:
548
- # Dynamic offloading
549
- fake_diffusers_current_device(text_encoder, device)
550
- load_model_as_complete(text_encoder_2, target_device=device)
551
 
552
- llama_vec, clip_l_pooler = encode_prompt_conds(
553
- prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
554
- )
555
- if cfg == 1:
556
- llama_vec_n, clip_l_pooler_n = (
557
- torch.zeros_like(llama_vec),
558
- torch.zeros_like(clip_l_pooler),
559
- )
560
- else:
561
- llama_vec_n, clip_l_pooler_n = encode_prompt_conds(
562
- n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2
563
- )
564
- llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
565
- llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
566
- except Exception as e:
567
- err = f"Text encoding error: {e}"
568
- print(err)
569
- traceback.print_exc()
570
- stream.output_queue.push(('error', err))
571
- stream.output_queue.push(('end', None))
572
- return
573
-
574
- # (2) Image processing
575
- last_update_time = time.time()
576
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing...'))))
577
-
578
- try:
579
- H, W, C = input_image.shape
580
- # Resolution bucket
581
- height, width = find_nearest_bucket(H, W, resolution=640)
582
-
583
- # In CPU mode, keep the resolution from getting too large
584
- if cpu_fallback_mode:
585
- height = min(height, 320)
586
- width = min(width, 320)
587
-
588
- input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
589
- Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
590
-
591
- input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
592
- input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
593
- except Exception as e:
594
- err = f"Image preprocess error: {e}"
595
- print(err)
596
- traceback.print_exc()
597
- stream.output_queue.push(('error', err))
598
- stream.output_queue.push(('end', None))
599
- return
600
-
601
- # (3) VAE Encoding
602
- last_update_time = time.time()
603
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding...'))))
604
-
605
- try:
606
- if not high_vram and not cpu_fallback_mode:
607
- load_model_as_complete(vae, target_device=device)
608
- start_latent = vae_encode(input_image_pt, vae)
609
- except Exception as e:
610
- err = f"VAE encode error: {e}"
611
- print(err)
612
- traceback.print_exc()
613
- stream.output_queue.push(('error', err))
614
- stream.output_queue.push(('end', None))
615
- return
616
-
617
- # (4) CLIP Vision
618
- last_update_time = time.time()
619
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encode...'))))
620
-
621
- try:
622
- if not high_vram and not cpu_fallback_mode:
623
- load_model_as_complete(image_encoder, target_device=device)
624
- image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
625
- image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
626
- except Exception as e:
627
- err = f"CLIP Vision encode error: {e}"
628
- print(err)
629
- traceback.print_exc()
630
- stream.output_queue.push(('error', err))
631
- stream.output_queue.push(('end', None))
632
- return
633
-
634
- # (5) dtype conversion
635
- try:
636
- llama_vec = llama_vec.to(transformer.dtype)
637
- llama_vec_n = llama_vec_n.to(transformer.dtype)
638
- clip_l_pooler = clip_l_pooler.to(transformer.dtype)
639
- clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
640
- image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
641
- except Exception as e:
642
- err = f"Data type conversion error: {e}"
643
- print(err)
644
- traceback.print_exc()
645
- stream.output_queue.push(('error', err))
646
- stream.output_queue.push(('end', None))
647
- return
648
-
649
- # (6) Sampling loop
650
- last_update_time = time.time()
651
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling...'))))
652
 
653
  rnd = torch.Generator("cpu").manual_seed(seed)
654
 
655
- # With the FramePack_F1 model, history_latents initially holds little more than [start_latent]
656
- # As in the second code sample, put start_latent into history_latents first, then extend section by section
657
- try:
658
- history_latents = start_latent.cpu()
659
- history_pixels = None
660
- total_generated_latent_frames = start_latent.shape[2]  # usually 1
661
- except Exception as e:
662
- err = f"Init history state error: {e}"
663
- print(err)
664
- traceback.print_exc()
665
- stream.output_queue.push(('error', err))
666
- stream.output_queue.push(('end', None))
667
- return
668
-
669
- # The mp4 CRF (quality) could be made configurable; here it is simply fixed at CRF=16
670
- mp4_crf = 16
671
 
672
  for section_index in range(total_latent_sections):
673
  if stream.input_queue.top() == 'end':
674
- # User abort
675
- if history_pixels is not None and total_generated_latent_frames > 0:
676
- try:
677
- outname = os.path.join(
678
- outputs_folder, f'{job_id}_final_{total_generated_latent_frames}.mp4'
679
- )
680
- save_bcthw_as_mp4(history_pixels, outname, fps=30, crf=mp4_crf)
681
- stream.output_queue.push(('file', outname))
682
- except Exception as e:
683
- print(f"Error saving final partial video: {e}")
684
  stream.output_queue.push(('end', None))
685
  return
686
 
687
- print(f"Section {section_index+1}/{total_latent_sections}")
688
-
689
- # Model swapping
690
- if not high_vram and not cpu_fallback_mode:
691
- try:
692
- unload_complete_models()
693
- move_model_to_device_with_memory_preservation(
694
- transformer, target_device=device, preserved_memory_gb=gpu_memory_preservation
695
- )
696
- except Exception as e:
697
- print(f"Error moving transformer to GPU: {e}")
698
-
699
- if use_teacache and not cpu_fallback_mode:
700
- try:
701
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
702
- except Exception as e:
703
- print(f"Error init teacache: {e}")
704
- transformer.initialize_teacache(enable_teacache=False)
705
  else:
706
  transformer.initialize_teacache(enable_teacache=False)
707
 
708
- # Callback
709
  def callback(d):
710
- global last_update_time
711
- last_update_time = time.time()
712
- try:
713
- if stream.input_queue.top() == 'end':
714
- stream.output_queue.push(('end', None))
715
- raise KeyboardInterrupt('User requested stop.')
716
- preview = d['denoised']
717
- preview = vae_decode_fake(preview)
718
- preview = (preview * 255.0).cpu().numpy().clip(0,255).astype(np.uint8)
719
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
720
-
721
- curr_step = d['i'] + 1
722
- percentage = int(100.0 * curr_step / steps)
723
- hint = f'Sampling {curr_step}/{steps}'
724
- desc = f'Section {section_index+1}/{total_latent_sections}'
725
- barhtml = make_progress_bar_html(percentage, hint)
726
- stream.output_queue.push(('progress', (preview, desc, barhtml)))
727
- except KeyboardInterrupt:
728
- raise
729
- except Exception as e:
730
- print(f"Callback error: {e}")
731
  return
732
 
733
- # Split indices as in the second example
734
- # FramePack_F1: the [1, 16, 2, 1, latent_window_size] scheme
735
- try:
736
- # Number of frames sampled per pass
737
- frames_per_section = latent_window_size * 4 - 3
738
-
739
- # Prepare indices
740
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
741
- (
742
- clean_latent_indices_start,
743
- clean_latent_4x_indices,
744
- clean_latent_2x_indices,
745
- clean_latent_1x_indices,
746
- latent_indices
747
- ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
748
-
749
- # Split the trailing 16+2+1 = 19 frames of history_latents into the clean_latents_xx tensors
750
- if history_latents.shape[2] < 19:
751
- # The initial state may have fewer than 19 frames, so pad
752
- # Here we simply pad history_latents up to 19 frames
753
- needed = 19 - history_latents.shape[2]
754
- if needed > 0:
755
- pad_shape = list(history_latents.shape)
756
- pad_shape[2] = needed
757
- pad_zeros = torch.zeros(pad_shape, dtype=history_latents.dtype)
758
- history_latents = torch.cat([pad_zeros, history_latents], dim=2)
759
-
760
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -19:, :, :].split([16, 2, 1], dim=2)
761
- # clean_latents is [start_latent + clean_latents_1x], i.e. only about one extra frame is concatenated
762
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
763
- except Exception as e:
764
- err = f"Indices prep error: {e}"
765
- print(err)
766
- traceback.print_exc()
767
- stream.output_queue.push(('error', err))
768
- stream.output_queue.push(('end', None))
769
- return
770
 
771
- # Actual sampling
772
  try:
773
  generated_latents = sample_hunyuan(
774
  transformer=transformer,
@@ -782,17 +1183,17 @@ def worker(
782
  num_inference_steps=steps,
783
  generator=rnd,
784
  prompt_embeds=llama_vec,
785
- prompt_embeds_mask=llama_attention_mask,
786
  prompt_poolers=clip_l_pooler,
787
  negative_prompt_embeds=llama_vec_n,
788
- negative_prompt_embeds_mask=llama_attention_mask_n,
789
  negative_prompt_poolers=clip_l_pooler_n,
790
- device=device,
791
- dtype=transformer.dtype,
792
  image_embeddings=image_encoder_last_hidden_state,
793
  latent_indices=latent_indices,
794
  clean_latents=clean_latents,
795
- clean_latent_indices=torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1),
796
  clean_latents_2x=clean_latents_2x,
797
  clean_latent_2x_indices=clean_latent_2x_indices,
798
  clean_latents_4x=clean_latents_4x,
@@ -800,251 +1201,116 @@ def worker(
800
  callback=callback
801
  )
802
  except KeyboardInterrupt:
803
- print("User stopped generation.")
804
- err = "User stopped generation, partial video returned."
805
- if last_output_filename:
806
- stream.output_queue.push(('file', last_output_filename))
807
- stream.output_queue.push(('error', err))
808
- stream.output_queue.push(('end', None))
809
- return
810
- except Exception as e:
811
- print(f"Sampling error: {e}")
812
- traceback.print_exc()
813
- if last_output_filename:
814
- err = f"Error during sampling, partial video returned: {e}"
815
- stream.output_queue.push(('file', last_output_filename))
816
- stream.output_queue.push(('error', err))
817
- else:
818
- err = f"Error during sampling: {e}"
819
- stream.output_queue.push(('error', err))
820
  stream.output_queue.push(('end', None))
821
  return
822
-
823
- try:
824
- # Append to the end of history_latents
825
- total_generated_latent_frames += generated_latents.shape[2]
826
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
827
  except Exception as e:
828
- err = f"Concat history_latents error: {e}"
829
- print(err)
830
  traceback.print_exc()
831
- stream.output_queue.push(('error', err))
832
  stream.output_queue.push(('end', None))
833
  return
834
 
835
- # Offload model / load VAE
836
- if not high_vram and not cpu_fallback_mode:
837
- try:
838
- offload_model_from_device_for_memory_preservation(transformer, target_device=device, preserved_memory_gb=8)
839
- load_model_as_complete(vae, target_device=device)
840
- except Exception as e:
841
- print(f"Model memory manage error: {e}")
842
 
843
- # VAE decode & save result
844
- try:
845
- real_history_latents = history_latents  # all frames
846
-
847
- # On the first decode
848
- if history_pixels is None:
849
- history_pixels = vae_decode(real_history_latents, vae).cpu()
850
- else:
851
- # Join overlapping frames (simple append).
852
- # Here the soft_append_bcthw approach from the second example is used as-is
853
- # frames_per_section = latent_window_size*4 - 3
854
- # The overlap (overlapped_frames) is the same: frames_per_section
855
- # In practice the first section may have almost no overlap, so clamp with min to be safe
856
- overlapped_frames = frames_per_section
857
- current_pixels = vae_decode(real_history_latents[:, :, -frames_per_section:], vae).cpu()
858
- history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
859
-
860
- output_filename = os.path.join(
861
- outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4'
862
- )
863
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
864
- last_output_filename = output_filename
865
- stream.output_queue.push(('file', output_filename))
866
- except Exception as e:
867
- print(f"Video decode/save error: {e}")
868
- traceback.print_exc()
869
- if last_output_filename:
870
- stream.output_queue.push(('file', last_output_filename))
871
- err = f"Video decode/save error: {e}"
872
- stream.output_queue.push(('error', err))
873
- continue
874
-
875
- # End of the for loop
876
- except Exception as e:
877
- print(f"Outer error: {e}, type={type(e)}")
878
- traceback.print_exc()
879
- if not high_vram and not cpu_fallback_mode:
880
- try:
881
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
882
- except Exception as ue:
883
- print(f"Unload error: {ue}")
884
 
885
- if last_output_filename:
886
- stream.output_queue.push(('file', last_output_filename))
887
- err = f"Error in worker: {e}"
888
- stream.output_queue.push(('error', err))
889
 
890
- print("Worker finished, pushing 'end'.")
891
- stream.output_queue.push(('end', None))
892
 
 
 
893
 
894
- # Wrap the process function depending on whether Spaces GPU is used inside Gradio
895
- if IN_HF_SPACE and 'spaces' in globals():
896
- @spaces.GPU
897
- def process_with_gpu(
898
- input_image, prompt, n_prompt, seed,
899
- total_second_length, latent_window_size, steps,
900
- cfg, gs, rs, gpu_memory_preservation, use_teacache
901
- ):
902
- global stream
903
- assert input_image is not None, "No input image given."
904
-
905
- # Initialization
906
- yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
907
- try:
908
- stream = AsyncStream()
909
- async_run(
910
- worker,
911
- input_image, prompt, n_prompt, seed,
912
- total_second_length, latent_window_size, steps, cfg, gs, rs,
913
- gpu_memory_preservation, use_teacache
914
- )
915
 
916
- output_filename = None
917
- prev_output_filename = None
918
- error_message = None
919
-
920
- while True:
921
- flag, data = stream.output_queue.next()
922
- if flag == 'file':
923
- output_filename = data
924
- prev_output_filename = output_filename
925
- yield output_filename, gr.update(), gr.update(), '', gr.update(interactive=False), gr.update(interactive=True)
926
-
927
- elif flag == 'progress':
928
- preview, desc, html = data
929
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
930
-
931
- elif flag == 'error':
932
- error_message = data
933
- print(f"Got error: {error_message}")
934
-
935
- elif flag == 'end':
936
- if output_filename is None and prev_output_filename:
937
- output_filename = prev_output_filename
938
- if error_message:
939
- err_html = create_error_html(error_message)
940
- yield (
941
- output_filename, gr.update(visible=False), gr.update(),
942
- err_html, gr.update(interactive=True), gr.update(interactive=False)
943
- )
944
- else:
945
- yield (
946
- output_filename, gr.update(visible=False), gr.update(),
947
- '', gr.update(interactive=True), gr.update(interactive=False)
948
- )
949
- break
950
- except Exception as e:
951
- print(f"Start process error: {e}")
952
- traceback.print_exc()
953
- err_html = create_error_html(str(e))
954
- yield None, gr.update(visible=False), gr.update(), err_html, gr.update(interactive=True), gr.update(interactive=False)
955
-
956
- process = process_with_gpu
957
- else:
958
- def process(
959
- input_image, prompt, n_prompt, seed,
960
- total_second_length, latent_window_size, steps,
961
- cfg, gs, rs, gpu_memory_preservation, use_teacache
962
- ):
963
- global stream
964
- assert input_image is not None, "No input image given."
965
-
966
- yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
967
- try:
968
- stream = AsyncStream()
969
- async_run(
970
- worker,
971
- input_image, prompt, n_prompt, seed,
972
- total_second_length, latent_window_size, steps, cfg, gs, rs,
973
- gpu_memory_preservation, use_teacache
974
- )
975
 
976
- output_filename = None
977
- prev_output_filename = None
978
- error_message = None
979
-
980
- while True:
981
- flag, data = stream.output_queue.next()
982
- if flag == 'file':
983
- output_filename = data
984
- prev_output_filename = output_filename
985
- yield output_filename, gr.update(), gr.update(), '', gr.update(interactive=False), gr.update(interactive=True)
986
-
987
- elif flag == 'progress':
988
- preview, desc, html = data
989
- yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
990
-
991
- elif flag == 'error':
992
- error_message = data
993
- print(f"Got error: {error_message}")
994
-
995
- elif flag == 'end':
996
- if output_filename is None and prev_output_filename:
997
- output_filename = prev_output_filename
998
- if error_message:
999
- err_html = create_error_html(error_message)
1000
- yield (
1001
- output_filename, gr.update(visible=False), gr.update(),
1002
- err_html, gr.update(interactive=True), gr.update(interactive=False)
1003
- )
1004
- else:
1005
- yield (
1006
- output_filename, gr.update(visible=False), gr.update(),
1007
- '', gr.update(interactive=True), gr.update(interactive=False)
1008
- )
1009
- break
1010
- except Exception as e:
1011
- print(f"Start process error: {e}")
1012
- traceback.print_exc()
1013
- err_html = create_error_html(str(e))
1014
- yield None, gr.update(visible=False), gr.update(), err_html, gr.update(interactive=True), gr.update(interactive=False)
1015
 
 
 
1016
 
1017
  def end_process():
1018
  """
1019
- Stop generation by pushing 'end' to the worker queue
1020
  """
1021
- print("User clicked stop, sending 'end' signal...")
1022
  global stream
1023
- if 'stream' in globals() and stream is not None:
1024
- try:
1025
- top_signal = stream.input_queue.top()
1026
- print(f"Queue top signal = {top_signal}")
1027
- except Exception as e:
1028
- print(f"Error checking queue top: {e}")
1029
- try:
1030
- stream.input_queue.push('end')
1031
- print("Pushed 'end' successfully.")
1032
- except Exception as e:
1033
- print(f"Error pushing 'end': {e}")
1034
- else:
1035
- print("Warning: Stream not initialized, cannot stop.")
1036
- return None
1037
 
1038
- # Example quick prompts
1039
- quick_prompts = [
1040
- ["The girl dances gracefully, with clear movements, full of charm."],
1041
- ["A character doing some simple body movements."]
1042
- ]
1043
 
 
1044
  def make_custom_css():
1045
  base_progress_css = make_progress_bar_css()
1046
  pastel_css = """
1047
- /* Pastel tones for a softer, more polished UI style */
1048
  body {
1049
  background: #faf9ff !important;
1050
  font-family: "Noto Sans", sans-serif;
@@ -1105,17 +1371,6 @@ def make_custom_css():
1105
  margin-top: 10px;
1106
  font-weight: 500;
1107
  }
1108
- .error-icon {
1109
- color: #E53E3E;
1110
- margin-right: 8px;
1111
- }
1112
- #error-message {
1113
- color: #ff4444;
1114
- font-weight: bold;
1115
- padding: 10px;
1116
- border-radius: 4px;
1117
- margin-top: 10px;
1118
- }
1119
  @media (max-width: 768px) {
1120
  #app-container {
1121
  padding: 0.5rem;
@@ -1132,22 +1387,29 @@ def make_custom_css():
1132
 
1133
  css = make_custom_css()
1134

1135
  # Gradio UI
1136
  block = gr.Blocks(css=css).queue()
1137
  with block:
1138
- # Top title
1139
  gr.HTML("<div id='app-container'><h1>FramePack - Image to Video Generation</h1></div>")
1140
 
1141
  with gr.Row(elem_classes="mobile-full-width"):
 
1142
  with gr.Column(scale=1, elem_classes="gr-panel"):
1143
  input_image = gr.Image(
1144
  label=get_translation("upload_image"),
1145
- sources='upload',
1146
  type="numpy",
1147
- elem_id="input-image",
1148
  height=320
1149
  )
1150
- prompt = gr.Textbox(label=get_translation("prompt"), value='', elem_id="prompt-input")
1151
 
1152
  example_quick_prompts = gr.Dataset(
1153
  samples=quick_prompts,
@@ -1162,6 +1424,8 @@ with block:
1162
  show_progress=False,
1163
  queue=False
1164
  )
1165
  with gr.Column(scale=1, elem_classes="gr-panel"):
1166
  with gr.Row(elem_classes="button-container"):
1167
  start_button = gr.Button(
@@ -1169,19 +1433,18 @@ with block:
1169
  elem_id="start-button",
1170
  variant="primary"
1171
  )
1172
- end_button = gr.Button(
1173
  value=get_translation("stop_generation"),
1174
  elem_id="stop-button",
1175
  interactive=False
1176
  )
1177
-
1178
  result_video = gr.Video(
1179
  label=get_translation("generated_video"),
1180
  autoplay=True,
1181
  loop=True,
1182
  height=320,
1183
- elem_classes="video-container",
1184
- elem_id="result-video"
1185
  )
1186
  preview_image = gr.Image(
1187
  label=get_translation("next_latents"),
@@ -1189,16 +1452,15 @@ with block:
1189
  height=150,
1190
  elem_classes="preview-container"
1191
  )
1192
-
1193
  gr.Markdown(get_translation("sampling_note"))
1194
-
1195
  with gr.Group(elem_classes="progress-container"):
1196
  progress_desc = gr.Markdown('')
1197
  progress_bar = gr.HTML('')
1198
-
1199
- error_message = gr.HTML('', elem_id='error-message', visible=True)
1200
 
1201
- # Advanced parameters accordion
1202
  with gr.Accordion("Advanced Settings", open=False, elem_classes="gr-panel"):
1203
  use_teacache = gr.Checkbox(
1204
  label=get_translation("use_teacache"),
@@ -1211,7 +1473,7 @@ with block:
1211
  value=31337,
1212
  precision=0
1213
  )
1214
- # Default (value) = 2, maximum = 4
1215
  total_second_length = gr.Slider(
1216
  label=get_translation("video_length"),
1217
  minimum=1,
@@ -1268,17 +1530,17 @@ with block:
1268
  info=get_translation("gpu_memory_info")
1269
  )
1270
 
1271
- # Button wiring
1272
- ips = [
1273
  input_image, prompt, n_prompt, seed,
1274
  total_second_length, latent_window_size, steps,
1275
  cfg, gs, rs, gpu_memory_preservation, use_teacache
1276
  ]
1277
  start_button.click(
1278
  fn=process,
1279
- inputs=ips,
1280
- outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]
1281
  )
1282
- end_button.click(fn=end_process)
1283
 
1284
  block.launch()
 
1
+ #############################################
2
  # from diffusers_helper.hf_login import login
3
+ # Use HF login if needed (uncomment the import above)
4
+ #############################################
5
 
6
  import os
7
+
8
+ os.environ['HF_HOME'] = os.path.abspath(
9
+ os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
10
+ )
11
+
12
+ import gradio as gr
13
+ import torch
14
+ import traceback
15
+ import einops
16
+ import safetensors.torch as sf
17
+ import numpy as np
18
+ import math
+ from PIL import Image  # Image.fromarray is used below to save the preprocessed input frame
19
  import time
20
+
21
+ # Check whether running in the Hugging Face Spaces environment
22
+ IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
23
+
24
+ # --------- Translation dictionary (English only) ---------
25
+ translations = {
26
+ "en": {
27
+ "title": "FramePack - Image to Video Generation",
28
+ "upload_image": "Upload Image",
29
+ "prompt": "Prompt",
30
+ "quick_prompts": "Quick Prompts",
31
+ "start_generation": "Generate",
32
+ "stop_generation": "Stop",
33
+ "use_teacache": "Use TeaCache",
34
+ "teacache_info": "Faster speed, but may result in slightly worse finger and hand generation.",
35
+ "negative_prompt": "Negative Prompt",
36
+ "seed": "Seed",
37
+ # UI label updated to a 4-second maximum
38
+ "video_length": "Video Length (max 4 seconds)",
39
+ "latent_window": "Latent Window Size",
40
+ "steps": "Inference Steps",
41
+ "steps_info": "Changing this value is not recommended.",
42
+ "cfg_scale": "CFG Scale",
43
+ "distilled_cfg": "Distilled CFG Scale",
44
+ "distilled_cfg_info": "Changing this value is not recommended.",
45
+ "cfg_rescale": "CFG Rescale",
46
+ "gpu_memory": "GPU Memory Preservation (GB) (larger means slower)",
47
+ "gpu_memory_info": "Set this to a larger value if you encounter OOM errors. Larger values cause slower speed.",
48
+ "next_latents": "Next Latents",
49
+ "generated_video": "Generated Video",
50
+ "sampling_note": "Note: The model predicts future frames from past frames. If the start action isn't immediately visible, please wait for more frames.",
51
+ "error_message": "Error",
52
+ "processing_error": "Processing error",
53
+ "network_error": "Network connection is unstable, model download timed out. Please try again later.",
54
+ "memory_error": "GPU memory insufficient, please try increasing GPU memory preservation value or reduce video length.",
55
+ "model_error": "Failed to load model, possibly due to network issues or high server load. Please try again later.",
56
+ "partial_video": "Processing error, but partial video has been generated",
57
+ "processing_interrupt": "Processing was interrupted, but partial video has been generated"
58
+ }
59
+ }
60
+
61
+ def get_translation(key):
62
+ return translations["en"].get(key, key)
63
+
64
+ #############################################
65
+ # diffusers_helper imports
66
+ #############################################
67
+ from diffusers_helper.thread_utils import AsyncStream, async_run
68
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
69
+ from diffusers_helper.memory import (
70
+ cpu,
71
+ gpu,
72
+ get_cuda_free_memory_gb,
73
+ move_model_to_device_with_memory_preservation,
74
+ offload_model_from_device_for_memory_preservation,
75
+ fake_diffusers_current_device,
76
+ DynamicSwapInstaller,
77
+ unload_complete_models,
78
+ load_model_as_complete
79
+ )
80
+ from diffusers_helper.utils import (
81
+ generate_timestamp,
82
+ save_bcthw_as_mp4,
83
+ resize_and_center_crop,
84
+ crop_or_pad_yield_mask,
85
+ soft_append_bcthw
86
+ )
87
+ from diffusers_helper.bucket_tools import find_nearest_bucket
88
+ from diffusers_helper.hunyuan import (
89
+ encode_prompt_conds, vae_encode, vae_decode, vae_decode_fake
90
+ )
91
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
92
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
93
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
94
+
95
+ from diffusers import AutoencoderKLHunyuanVideo
96
+ from transformers import (
97
+ LlamaModel, CLIPTextModel,
98
+ LlamaTokenizerFast, CLIPTokenizer,
99
+ SiglipVisionModel, SiglipImageProcessor
100
+ )
101
+
102
+ #############################################
103
+ # GPU check
104
+ #############################################
105
+ GPU_AVAILABLE = torch.cuda.is_available()
106
+ free_mem_gb = 0.0
107
+ high_vram = False
108
+ if GPU_AVAILABLE:
109
+ try:
110
+ free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
111
+ high_vram = (free_mem_gb > 60)
112
+ except:
113
+ pass
114
+ print(f"GPU Available: {GPU_AVAILABLE}, free_mem_gb={free_mem_gb}, high_vram={high_vram}")
115
+
116
+ cpu_fallback_mode = not GPU_AVAILABLE
117
+ last_update_time = time.time()
118
+
119
+ #############################################
120
+ # Model loading (global)
121
+ #############################################
122
+ text_encoder = None
123
+ text_encoder_2 = None
124
+ tokenizer = None
125
+ tokenizer_2 = None
126
+ vae = None
127
+ feature_extractor = None
128
+ image_encoder = None
129
+ transformer = None
130
+
131
+ # The logic below reuses the model-loading portion of the 'second code sample' almost verbatim
132
+ def load_global_models():
133
+ global text_encoder, text_encoder_2, tokenizer, tokenizer_2
134
+ global vae, feature_extractor, image_encoder, transformer
135
+ global cpu_fallback_mode
136
+
137
+ # Skip if already loaded
138
+ if transformer is not None:
139
+ return
140
+
141
+ # GPU memory info
142
+ device = gpu if GPU_AVAILABLE else cpu
143
+
144
+ # Could be measured more precisely with diffusers_helper.memory.get_cuda_free_memory_gb(gpu)
145
+ print("Loading models...")
146
+
147
+ # ======== Actual code: based on the second example =========
148
+ # (1) Hybrid: if high_vram, load onto the GPU; otherwise CPU + DynamicSwap
149
+
150
+ # Always load as float16 / bfloat16
151
+ text_encoder_local = LlamaModel.from_pretrained(
152
+ "hunyuanvideo-community/HunyuanVideo",
153
+ subfolder='text_encoder',
154
+ torch_dtype=torch.float16
155
+ ).cpu()
156
+
157
+ text_encoder_2_local = CLIPTextModel.from_pretrained(
158
+ "hunyuanvideo-community/HunyuanVideo",
159
+ subfolder='text_encoder_2',
160
+ torch_dtype=torch.float16
161
+ ).cpu()
162
+
163
+ tokenizer_local = LlamaTokenizerFast.from_pretrained(
164
+ "hunyuanvideo-community/HunyuanVideo",
165
+ subfolder='tokenizer'
166
+ )
167
+ tokenizer_2_local = CLIPTokenizer.from_pretrained(
168
+ "hunyuanvideo-community/HunyuanVideo",
169
+ subfolder='tokenizer_2'
170
+ )
171
+
172
+ vae_local = AutoencoderKLHunyuanVideo.from_pretrained(
173
+ "hunyuanvideo-community/HunyuanVideo",
174
+ subfolder='vae',
175
+ torch_dtype=torch.float16
176
+ ).cpu()
177
+
178
+ feature_extractor_local = SiglipImageProcessor.from_pretrained(
179
+ "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
180
+ )
181
+ image_encoder_local = SiglipVisionModel.from_pretrained(
182
+ "lllyasviel/flux_redux_bfl",
183
+ subfolder='image_encoder',
184
+ torch_dtype=torch.float16
185
+ ).cpu()
186
+
187
+ # FramePack_F1_I2V_HY_20250503 (bfloat16)
188
+ transformer_local = HunyuanVideoTransformer3DModelPacked.from_pretrained(
189
+ 'lllyasviel/FramePack_F1_I2V_HY_20250503',
190
+ torch_dtype=torch.bfloat16
191
+ ).cpu()
192
+
193
+ # eval & dtype
194
+ vae_local.eval()
195
+ text_encoder_local.eval()
196
+ text_encoder_2_local.eval()
197
+ image_encoder_local.eval()
198
+ transformer_local.eval()
199
+
200
+ # VAE slicing for low VRAM
201
+ if not high_vram:
202
+ vae_local.enable_slicing()
203
+ vae_local.enable_tiling()
204
+
205
+ # For offloading
206
+ transformer_local.high_quality_fp32_output_for_inference = True
207
+ transformer_local.to(dtype=torch.bfloat16)
208
+ vae_local.to(dtype=torch.float16)
209
+ image_encoder_local.to(dtype=torch.float16)
210
+ text_encoder_local.to(dtype=torch.float16)
211
+ text_encoder_2_local.to(dtype=torch.float16)
212
+
213
+ # requires_grad_(False)
214
+ for m in [vae_local, text_encoder_local, text_encoder_2_local, image_encoder_local, transformer_local]:
215
+ m.requires_grad_(False)
216
+
217
+ # GPU mode with plenty of VRAM: move everything to the GPU
218
+ # Otherwise use DynamicSwap
219
+ if GPU_AVAILABLE:
220
+ if not high_vram:
221
+ DynamicSwapInstaller.install_model(transformer_local, device=gpu)
222
+ DynamicSwapInstaller.install_model(text_encoder_local, device=gpu)
223
+ else:
224
+ text_encoder_local.to(gpu)
225
+ text_encoder_2_local.to(gpu)
226
+ image_encoder_local.to(gpu)
227
+ vae_local.to(gpu)
228
+ transformer_local.to(gpu)
229
+ else:
230
+ cpu_fallback_mode = True
231
+
232
+ # Assign to globals
233
+ print("Model loaded.")
234
+ text_encoder = text_encoder_local
235
+ text_encoder_2 = text_encoder_2_local
236
+ tokenizer = tokenizer_local
237
+ tokenizer_2 = tokenizer_2_local
238
+ vae = vae_local
239
+ feature_extractor = feature_extractor_local
240
+ image_encoder = image_encoder_local
241
+ transformer = transformer_local
242
+
243
+ #############################################
244
+ # Worker ๋กœ์ง (๋‘ ๋ฒˆ์งธ ์ฝ”๋“œ) ๊ทธ๋Œ€๋กœ
245
+ #############################################
246
+ stream = AsyncStream()
247
+
248
+ outputs_folder = './outputs/'
249
+ os.makedirs(outputs_folder, exist_ok=True)
250
+
251
+ @torch.no_grad()
252
+ def worker(
253
+ input_image, prompt, n_prompt, seed,
254
+ total_second_length, latent_window_size, steps,
255
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
256
+ ):
257
+ """
258
+ Actual sampling logic (based on the second code sample)
259
+ """
260
+ load_global_models()  # load models
261
+ global text_encoder, text_encoder_2, tokenizer, tokenizer_2
262
+ global vae, feature_extractor, image_encoder, transformer
263
+ global last_update_time
264
+
265
+ # Cap at 4 seconds
266
+ total_second_length = min(total_second_length, 4.0)
267
+
268
+ total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
269
+ total_latent_sections = int(max(round(total_latent_sections), 1))
270
+
271
+ job_id = generate_timestamp()
272
+
273
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
274
+
275
+ try:
276
+ # Unload when GPU memory is tight
277
+ if not high_vram and GPU_AVAILABLE:
278
+ unload_complete_models(
279
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
280
+ )
281
+
282
+ # Text encoding
283
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
284
+
285
+ if not high_vram and GPU_AVAILABLE:
286
+ fake_diffusers_current_device(text_encoder, gpu)
287
+ load_model_as_complete(text_encoder_2, target_device=gpu)
288
+
289
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
290
+ if cfg == 1.0:
291
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
292
+ else:
293
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
294
+
295
+ llama_vec, llama_mask = crop_or_pad_yield_mask(llama_vec, length=512)
296
+ llama_vec_n, llama_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
297
+
298
+ # Image processing
299
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
300
+
301
+ H, W, C = input_image.shape
302
+ height, width = find_nearest_bucket(H, W, resolution=640)
303
+
304
+ if cpu_fallback_mode:
305
+ height = min(height, 320)
306
+ width = min(width, 320)
307
+
308
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
309
+
310
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
311
+
312
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
313
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
314
+
315
+ # VAE encode
316
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
317
+
318
+ if not high_vram and GPU_AVAILABLE:
319
+ load_model_as_complete(vae, target_device=gpu)
320
+ start_latent = vae_encode(input_image_pt, vae)
321
+
322
+ # CLIP Vision
323
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
324
+
325
+ if not high_vram and GPU_AVAILABLE:
326
+ load_model_as_complete(image_encoder, target_device=gpu)
327
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
328
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
329
+
330
+ # dtype
331
+ llama_vec = llama_vec.to(transformer.dtype)
332
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
333
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
334
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
335
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
336
+
337
+ # Start sampling
338
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
339
+
340
+ rnd = torch.Generator("cpu").manual_seed(seed)
341
+
342
+ # Initial history latents
343
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
344
+ history_pixels = None
345
+
346
+ # Append start_latent
347
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
348
+ total_generated_latent_frames = 1
349
+
350
+ for section_index in range(total_latent_sections):
351
+ if stream.input_queue.top() == 'end':
352
+ stream.output_queue.push(('end', None))
353
+ return
354
+
355
+ print(f'Section {section_index+1}/{total_latent_sections}')
356
+
357
+ if not high_vram and GPU_AVAILABLE:
358
+ unload_complete_models()
359
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
360
+
361
+ # teacache
362
+ if use_teacache:
363
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
364
+ else:
365
+ transformer.initialize_teacache(enable_teacache=False)
366
+
367
+ def callback(d):
368
+ preview = d['denoised']
369
+ preview = vae_decode_fake(preview)
370
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
371
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
372
+
373
+ if stream.input_queue.top() == 'end':
374
+ stream.output_queue.push(('end', None))
375
+ raise KeyboardInterrupt('User stops generation.')
376
+
377
+ current_step = d['i'] + 1
378
+ percentage = int(100.0 * current_step / steps)
379
+ hint = f'Sampling {current_step}/{steps}'
380
+ desc = f'Section {section_index+1}/{total_latent_sections}'
381
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
382
+ return
383
+
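Note: the einops pattern above tiles the decoded preview into one filmstrip image, stacking time along the width. A shape-only illustration with random data:

import torch, einops
fake = torch.rand(1, 3, 4, 64, 64)                             # b c t h w
strip = einops.rearrange(fake, 'b c t h w -> (b h) (t w) c')
assert strip.shape == (64, 4 * 64, 3)                          # 4 frames side by side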
384
+ # indices
385
+ frames_per_section = latent_window_size * 4 - 3
386
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
387
+ (
388
+ clean_latent_indices_start,
389
+ clean_latent_4x_indices,
390
+ clean_latent_2x_indices,
391
+ clean_latent_1x_indices,
392
+ latent_indices
393
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
394
+
395
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
396
+
397
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -19:, :, :].split([16, 2, 1], dim=2)
398
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
399
+
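Note: with the default latent_window_size of 9, the index tensor above has 1 + 16 + 2 + 1 + 9 = 29 positions, and each section denoises latent_window_size * 4 - 3 = 33 pixel frames. A quick check of the split sizes:

import torch
latent_window_size = 9
idx = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
parts = idx.split([1, 16, 2, 1, latent_window_size], dim=1)
assert [p.shape[1] for p in parts] == [1, 16, 2, 1, 9]
assert latent_window_size * 4 - 3 == 33            # pixel frames produced per section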
400
+ try:
401
+ generated_latents = sample_hunyuan(
402
+ transformer=transformer,
403
+ sampler='unipc',
404
+ width=width,
405
+ height=height,
406
+ frames=frames_per_section,
407
+ real_guidance_scale=cfg,
408
+ distilled_guidance_scale=gs,
409
+ guidance_rescale=rs,
410
+ num_inference_steps=steps,
411
+ generator=rnd,
412
+ prompt_embeds=llama_vec,
413
+ prompt_embeds_mask=llama_mask,
414
+ prompt_poolers=clip_l_pooler,
415
+ negative_prompt_embeds=llama_vec_n,
416
+ negative_prompt_embeds_mask=llama_mask_n,
417
+ negative_prompt_poolers=clip_l_pooler_n,
418
+ device=gpu if GPU_AVAILABLE else cpu,
419
+ dtype=torch.bfloat16,
420
+ image_embeddings=image_encoder_last_hidden_state,
421
+ latent_indices=latent_indices,
422
+ clean_latents=clean_latents,
423
+ clean_latent_indices=clean_latent_indices,
424
+ clean_latents_2x=clean_latents_2x,
425
+ clean_latent_2x_indices=clean_latent_2x_indices,
426
+ clean_latents_4x=clean_latents_4x,
427
+ clean_latent_4x_indices=clean_latent_4x_indices,
428
+ callback=callback
429
+ )
430
+ except KeyboardInterrupt:
431
+ print("User cancelled.")
432
+ stream.output_queue.push(('end', None))
433
+ return
434
+ except Exception as e:
435
+ traceback.print_exc()
436
+ stream.output_queue.push(('end', None))
437
+ return
438
+
439
+ total_generated_latent_frames += generated_latents.shape[2]
440
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
441
+
442
+ if not high_vram and GPU_AVAILABLE:
443
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
444
+ load_model_as_complete(vae, target_device=gpu)
445
+
446
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
447
+
448
+ if history_pixels is None:
449
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
450
+ else:
451
+ section_latent_frames = latent_window_size * 2
452
+ overlapped_frames = frames_per_section
453
+ current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
454
+ history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
455
+
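Note: the else-branch above decodes only the tail of the latent history and blends it onto the frames already decoded, which keeps each section's VAE decode small. The exact blend is whatever soft_append_bcthw implements; the idea is an overlap-and-crossfade along time, roughly like this sketch (not the library's code):

import torch

def crossfade_append(history, current, overlap):
    # history, current: (B, C, T, H, W); blend `overlap` frames, then append the rest
    w = torch.linspace(1, 0, overlap).view(1, 1, overlap, 1, 1)
    blended = history[:, :, -overlap:] * w + current[:, :, :overlap] * (1 - w)
    return torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)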
456
+ if not high_vram and GPU_AVAILABLE:
457
+ unload_complete_models()
458
+
459
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
460
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=16) # CRF=16
461
+
462
+ stream.output_queue.push(('file', output_filename))
463
+
464
+ except:
465
+ traceback.print_exc()
466
+ if not high_vram and GPU_AVAILABLE:
467
+ unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
468
+
469
+ stream.output_queue.push(('end', None))
470
+ return
471
+
472
+ def end_process():
473
+ """
474
+ Request the current generation to stop.
475
+ """
476
+ global stream
477
+ stream.input_queue.push('end')
478
+
479
+ # Gradio์—์„œ ์ด worker ํ•จ์ˆ˜๋ฅผ ๋น„๋™๊ธฐ๋กœ ํ˜ธ์ถœ
480
+ def process(
481
+ input_image, prompt, n_prompt, seed,
482
+ total_second_length, latent_window_size, steps,
483
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
484
+ ):
485
+ global stream
486
+ if input_image is None:
487
+ raise ValueError("No input image provided.")
488
+
489
+ yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
490
+
491
+ stream = AsyncStream()
492
+ async_run(
493
+ worker,
494
+ input_image, prompt, n_prompt, seed,
495
+ total_second_length, latent_window_size, steps,
496
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
497
+ )
498
+
499
+ output_filename = None
500
+ prev_filename = None
501
+ error_message = None
502
+
503
+ while True:
504
+ flag, data = stream.output_queue.next()
505
+ if flag == 'file':
506
+ output_filename = data
507
+ prev_filename = output_filename
508
+ yield output_filename, gr.update(), gr.update(), "", gr.update(interactive=False), gr.update(interactive=True)
509
+
510
+ elif flag == 'progress':
511
+ preview, desc, html = data
512
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
513
+
514
+ elif flag == 'error':
515
+ error_message = data
516
+ print(f"Error: {error_message}")
517
+
518
+ elif flag == 'end':
519
+ if output_filename is None and prev_filename:
520
+ output_filename = prev_filename
521
+ # Show the error if one occurred
522
+ if error_message:
523
+ yield (
524
+ output_filename, # last file (or None)
525
+ gr.update(visible=False),
526
+ gr.update(),
527
+ f"<div style='color:red;'>{error_message}</div>",
528
+ gr.update(interactive=True),
529
+ gr.update(interactive=False)
530
+ )
531
+ else:
532
+ yield (
533
+ output_filename, gr.update(visible=False), gr.update(), "", gr.update(interactive=True), gr.update(interactive=False)
534
+ )
535
+ break
536
+
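Note: process() and worker() talk through AsyncStream: the worker pushes ('progress', ...), ('file', path) and finally ('end', None) onto output_queue, and the UI pushes 'end' onto input_queue to cancel. The loop above is the consumer half of that protocol; a plain-queue analogue of the same idea:

import queue

q = queue.Queue()
q.put(('progress', 'Sampling 1/25'))
q.put(('file', 'outputs/job_1.mp4'))
q.put(('end', None))

while True:
    flag, data = q.get()
    if flag == 'end':
        break
    print(flag, data)   # the real loop yields Gradio component updates instead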
537
+ # UI CSS
538
+ def make_custom_css():
539
+ base_progress_css = make_progress_bar_css()
540
+ pastel_css = """
541
+ body {
542
+ background: #faf9ff !important;
543
+ font-family: "Noto Sans", sans-serif;
544
+ }
545
+ #app-container {
546
+ max-width: 1200px;
547
+ margin: 0 auto;
548
+ padding: 1rem;
549
+ position: relative;
550
+ }
551
+ #app-container h1 {
552
+ color: #5F5AA2;
553
+ margin-bottom: 1.2rem;
554
+ font-weight: 700;
555
+ text-shadow: 1px 1px 2px #bbb;
556
+ }
557
+ .gr-panel {
558
+ background: #ffffffcc;
559
+ border: 1px solid #e1dff0;
560
+ border-radius: 8px;
561
+ padding: 1rem;
562
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
563
+ }
564
+ .button-container button {
565
+ min-height: 45px;
566
+ font-size: 1rem;
567
+ font-weight: 600;
568
+ border-radius: 6px;
569
+ }
570
+ .button-container button#start-button {
571
+ background-color: #A289E3 !important;
572
+ color: #fff !important;
573
+ border: 1px solid #a58de2;
574
+ }
575
+ .button-container button#stop-button {
576
+ background-color: #F48A9B !important;
577
+ color: #fff !important;
578
+ border: 1px solid #f18fa0;
579
+ }
580
+ .button-container button:hover {
581
+ filter: brightness(0.95);
582
+ }
583
+ .preview-container, .video-container {
584
+ border: 1px solid #ded9f2;
585
+ border-radius: 8px;
586
+ overflow: hidden;
587
+ }
588
+ .progress-container {
589
+ margin-top: 15px;
590
+ margin-bottom: 15px;
591
+ }
592
+ .error-message {
593
+ background-color: #FFF5F5;
594
+ border: 1px solid #FED7D7;
595
+ color: #E53E3E;
596
+ padding: 10px;
597
+ border-radius: 4px;
598
+ margin-top: 10px;
599
+ font-weight: 500;
600
+ }
601
+ @media (max-width: 768px) {
602
+ #app-container {
603
+ padding: 0.5rem;
604
+ }
605
+ .mobile-full-width {
606
+ flex-direction: column !important;
607
+ }
608
+ .mobile-full-width > .gr-block {
609
+ width: 100% !important;
610
+ }
611
+ }
612
+ """
613
+ return base_progress_css + pastel_css
614
+
615
+ css = make_custom_css()
616
+
617
+ # Sample prompts
618
+ quick_prompts = [
619
+ ["The girl dances gracefully, with clear movements, full of charm."],
620
+ ["A character doing some simple body movements."]
621
+ ]
622
+
623
+ # Gradio UI
624
+ block = gr.Blocks(css=css).queue()
625
+ with block:
626
+ gr.HTML("<div id='app-container'><h1>FramePack - Image to Video Generation</h1></div>")
627
+
628
+ with gr.Row(elem_classes="mobile-full-width"):
629
+ # Left column
630
+ with gr.Column(scale=1, elem_classes="gr-panel"):
631
+ input_image = gr.Image(
632
+ label=get_translation("upload_image"),
633
+ type="numpy",
634
+ height=320
635
+ )
636
+ prompt = gr.Textbox(
637
+ label=get_translation("prompt"),
638
+ value=''
639
+ )
640
+
641
+ example_quick_prompts = gr.Dataset(
642
+ samples=quick_prompts,
643
+ label=get_translation("quick_prompts"),
644
+ samples_per_page=1000,
645
+ components=[prompt]
646
+ )
647
+ example_quick_prompts.click(
648
+ fn=lambda x: x[0],
649
+ inputs=[example_quick_prompts],
650
+ outputs=prompt,
651
+ show_progress=False,
652
+ queue=False
653
+ )
654
+
655
+ # Right column
656
+ with gr.Column(scale=1, elem_classes="gr-panel"):
657
+ with gr.Row(elem_classes="button-container"):
658
+ start_button = gr.Button(
659
+ value=get_translation("start_generation"),
660
+ elem_id="start-button",
661
+ variant="primary"
662
+ )
663
+ stop_button = gr.Button(
664
+ value=get_translation("stop_generation"),
665
+ elem_id="stop-button",
666
+ interactive=False
667
+ )
668
+
669
+ result_video = gr.Video(
670
+ label=get_translation("generated_video"),
671
+ autoplay=True,
672
+ loop=True,
673
+ height=320,
674
+ elem_classes="video-container"
675
+ )
676
+ preview_image = gr.Image(
677
+ label=get_translation("next_latents"),
678
+ visible=False,
679
+ height=150,
680
+ elem_classes="preview-container"
681
+ )
682
+ gr.Markdown(get_translation("sampling_note"))
683
+
684
+ with gr.Group(elem_classes="progress-container"):
685
+ progress_desc = gr.Markdown('')
686
+ progress_bar = gr.HTML('')
687
+
688
+ error_message = gr.HTML('', visible=True)
689
+
690
+ # Advanced
691
+ with gr.Accordion("Advanced Settings", open=False, elem_classes="gr-panel"):
692
+ use_teacache = gr.Checkbox(
693
+ label=get_translation("use_teacache"),
694
+ value=True,
695
+ info=get_translation("teacache_info")
696
+ )
697
+ n_prompt = gr.Textbox(label=get_translation("negative_prompt"), value="", visible=False)
698
+ seed = gr.Number(
699
+ label=get_translation("seed"),
700
+ value=31337,
701
+ precision=0
702
+ )
703
+ # Default 2 seconds, maximum 4 seconds
704
+ total_second_length = gr.Slider(
705
+ label=get_translation("video_length"),
706
+ minimum=1,
707
+ maximum=4,
708
+ value=2,
709
+ step=0.1
710
+ )
711
+ latent_window_size = gr.Slider(
712
+ label=get_translation("latent_window"),
713
+ minimum=1,
714
+ maximum=33,
715
+ value=9,
716
+ step=1,
717
+ visible=False
718
+ )
719
+ steps = gr.Slider(
720
+ label=get_translation("steps"),
721
+ minimum=1,
722
+ maximum=100,
723
+ value=25,
724
+ step=1,
725
+ info=get_translation("steps_info")
726
+ )
727
+ cfg = gr.Slider(
728
+ label=get_translation("cfg_scale"),
729
+ minimum=1.0,
730
+ maximum=32.0,
731
+ value=1.0,
732
+ step=0.01,
733
+ visible=False
734
+ )
735
+ gs = gr.Slider(
736
+ label=get_translation("distilled_cfg"),
737
+ minimum=1.0,
738
+ maximum=32.0,
739
+ value=10.0,
740
+ step=0.01,
741
+ info=get_translation("distilled_cfg_info")
742
+ )
743
+ rs = gr.Slider(
744
+ label=get_translation("cfg_rescale"),
745
+ minimum=0.0,
746
+ maximum=1.0,
747
+ value=0.0,
748
+ step=0.01,
749
+ visible=False
750
+ )
751
+ gpu_memory_preservation = gr.Slider(
752
+ label=get_translation("gpu_memory"),
753
+ minimum=6,
754
+ maximum=128,
755
+ value=6,
756
+ step=0.1,
757
+ info=get_translation("gpu_memory_info")
758
+ )
759
+
760
+ # Button wiring
761
+ inputs_list = [
762
+ input_image, prompt, n_prompt, seed,
763
+ total_second_length, latent_window_size, steps,
764
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
765
+ ]
766
+ start_button.click(
767
+ fn=process,
768
+ inputs=inputs_list,
769
+ outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, stop_button]
770
+ )
771
+ stop_button.click(fn=end_process)
772
+
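Note: every value yielded by process() is a 6-tuple whose order must match the outputs list above; if the two drift apart, updates are silently routed to the wrong components. The mapping, for reference:

# yield (video_path, preview_update, desc_markdown, bar_html, start_btn_update, stop_btn_update)
#        result_video  preview_image  progress_desc  progress_bar  start_button  stop_button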
773
+ block.launch()
774
+ #############################################
775
+ # from diffusers_helper.hf_login import login
776
+ # Uncomment to use HF login if needed
777
+ #############################################
778
+
779
+ import os
780
 
781
  os.environ['HF_HOME'] = os.path.abspath(
782
  os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
783
  )
784
 
785
+ import gradio as gr
786
+ import torch
787
+ import traceback
788
+ import einops
789
+ import safetensors.torch as sf
790
+ import numpy as np
791
+ import math
792
+ import time
+ from PIL import Image # used by Image.fromarray in worker(); add if not already imported
793
+
794
+ # Check whether we are running inside a Hugging Face Space
795
+ IN_HF_SPACE = os.environ.get('SPACE_ID') is not None
796
+
797
+ # --------- Translation dictionary (English only) ---------
798
  translations = {
799
  "en": {
800
  "title": "FramePack - Image to Video Generation",
 
807
  "teacache_info": "Faster speed, but may result in slightly worse finger and hand generation.",
808
  "negative_prompt": "Negative Prompt",
809
  "seed": "Seed",
810
+ # UI label reflects the 4-second cap
811
  "video_length": "Video Length (max 4 seconds)",
812
  "latent_window": "Latent Window Size",
813
  "steps": "Inference Steps",
 
820
  "gpu_memory_info": "Set this to a larger value if you encounter OOM errors. Larger values cause slower speed.",
821
  "next_latents": "Next Latents",
822
  "generated_video": "Generated Video",
823
+ "sampling_note": "Note: The model predicts future frames from past frames. If the start action isn't immediately visible, please wait for more frames.",
824
  "error_message": "Error",
825
  "processing_error": "Processing error",
826
  "network_error": "Network connection is unstable, model download timed out. Please try again later.",
 
831
  }
832
  }
833
 
 
834
  def get_translation(key):
835
  return translations["en"].get(key, key)
836
 
837
+ #############################################
838
+ # diffusers_helper imports
839
+ #############################################
840
+ from diffusers_helper.thread_utils import AsyncStream, async_run
841
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
 
842
  from diffusers_helper.memory import (
843
  cpu,
844
  gpu,
 
850
  unload_complete_models,
851
  load_model_as_complete
852
  )
853
+ from diffusers_helper.utils import (
854
+ generate_timestamp,
855
+ save_bcthw_as_mp4,
856
+ resize_and_center_crop,
857
+ crop_or_pad_yield_mask,
858
+ soft_append_bcthw
859
+ )
860
+ from diffusers_helper.bucket_tools import find_nearest_bucket
861
+ from diffusers_helper.hunyuan import (
862
+ encode_prompt_conds, vae_encode, vae_decode, vae_decode_fake
863
  )
864
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
865
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
866
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
867
 
868
+ from diffusers import AutoencoderKLHunyuanVideo
869
+ from transformers import (
870
+ LlamaModel, CLIPTextModel,
871
+ LlamaTokenizerFast, CLIPTokenizer,
872
+ SiglipVisionModel, SiglipImageProcessor
873
+ )
874
 
875
+ #############################################
876
+ # GPU ์ฒดํฌ
877
+ #############################################
878
+ GPU_AVAILABLE = torch.cuda.is_available()
879
+ free_mem_gb = 0.0
880
+ high_vram = False
881
+ if GPU_AVAILABLE:
 
 
 
 
 
 
 
 
882
  try:
883
+ free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
884
+ high_vram = (free_mem_gb > 60)
885
+ except:
886
+ pass
887
+ print(f"GPU Available: {GPU_AVAILABLE}, free_mem_gb={free_mem_gb}, high_vram={high_vram}")
888
+
 
 
 
 
 
 
889
  cpu_fallback_mode = not GPU_AVAILABLE
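Note: get_device_properties(0).total_memory in the try-block above is the card's total VRAM, not what is currently free, so high_vram reflects card size rather than headroom. If actual headroom is wanted, torch exposes it directly; a small alternative check:

import torch

if torch.cuda.is_available():
    free_b, total_b = torch.cuda.mem_get_info()    # bytes free / total on the current device
    print(f"free={free_b / 1e9:.1f} GB of {total_b / 1e9:.1f} GB")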
890
+ last_update_time = time.time()
891
 
892
+ #############################################
893
+ # ๋ชจ๋ธ ๋กœ๋“œ (์ „์—ญ)
894
+ #############################################
895
+ text_encoder = None
896
+ text_encoder_2 = None
897
+ tokenizer = None
898
+ tokenizer_2 = None
899
+ vae = None
900
+ feature_extractor = None
901
+ image_encoder = None
902
+ transformer = None
903
+
904
+ # ์•„๋ž˜ ๋กœ์ง์€ ์งˆ๋ฌธ์— ์ œ์‹œ๋œ '๋‘ ๋ฒˆ์งธ ์ฝ”๋“œ'์˜ ๋ชจ๋ธ ๋กœ๋“œ ๋ถ€๋ถ„์„ ๊ฑฐ์˜ ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
905
+ def load_global_models():
906
+ global text_encoder, text_encoder_2, tokenizer, tokenizer_2
907
+ global vae, feature_extractor, image_encoder, transformer
908
+ global cpu_fallback_mode
909
+
910
+ # Skip if the models are already loaded
911
+ if transformer is not None:
912
+ return
913
 
914
+ # GPU memory info
915
+ device = gpu if GPU_AVAILABLE else cpu
916
 
917
+ # diffusers_helper.memory.get_cuda_free_memory_gb(gpu) gives a more precise figure if needed
918
+ print("Loading models...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
919
 
920
+ # ======== Actual loading code ========
921
+ # (1) Hybrid placement: load onto the GPU when high_vram, otherwise keep on CPU and use DynamicSwap
 
 
 
 
922
 
923
+ # Always load in float16 / bfloat16
924
+ text_encoder_local = LlamaModel.from_pretrained(
925
+ "hunyuanvideo-community/HunyuanVideo",
926
+ subfolder='text_encoder',
927
+ torch_dtype=torch.float16
928
+ ).cpu()
 
 
929
 
930
+ text_encoder_2_local = CLIPTextModel.from_pretrained(
931
+ "hunyuanvideo-community/HunyuanVideo",
932
+ subfolder='text_encoder_2',
933
+ torch_dtype=torch.float16
934
+ ).cpu()
 
 
 
 
935
 
936
+ tokenizer_local = LlamaTokenizerFast.from_pretrained(
937
+ "hunyuanvideo-community/HunyuanVideo",
938
+ subfolder='tokenizer'
939
+ )
940
+ tokenizer_2_local = CLIPTokenizer.from_pretrained(
941
+ "hunyuanvideo-community/HunyuanVideo",
942
+ subfolder='tokenizer_2'
943
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
944
 
945
+ vae_local = AutoencoderKLHunyuanVideo.from_pretrained(
946
+ "hunyuanvideo-community/HunyuanVideo",
947
+ subfolder='vae',
948
+ torch_dtype=torch.float16
949
+ ).cpu()
950
 
951
+ feature_extractor_local = SiglipImageProcessor.from_pretrained(
952
+ "lllyasviel/flux_redux_bfl", subfolder='feature_extractor'
953
+ )
954
+ image_encoder_local = SiglipVisionModel.from_pretrained(
955
+ "lllyasviel/flux_redux_bfl",
956
+ subfolder='image_encoder',
957
+ torch_dtype=torch.float16
958
+ ).cpu()
959
+
960
+ # FramePack_F1_I2V_HY_20250503 (bfloat16)
961
+ transformer_local = HunyuanVideoTransformer3DModelPacked.from_pretrained(
962
+ 'lllyasviel/FramePack_F1_I2V_HY_20250503',
963
+ torch_dtype=torch.bfloat16
964
+ ).cpu()
965
+
966
+ # eval & dtype
967
+ vae_local.eval()
968
+ text_encoder_local.eval()
969
+ text_encoder_2_local.eval()
970
+ image_encoder_local.eval()
971
+ transformer_local.eval()
972
+
973
+ # VAE slicing for low VRAM
974
+ if not high_vram:
975
+ vae_local.enable_slicing()
976
+ vae_local.enable_tiling()
977
+
978
+ # Offloading-related settings
979
+ transformer_local.high_quality_fp32_output_for_inference = True
980
+ transformer_local.to(dtype=torch.bfloat16)
981
+ vae_local.to(dtype=torch.float16)
982
+ image_encoder_local.to(dtype=torch.float16)
983
+ text_encoder_local.to(dtype=torch.float16)
984
+ text_encoder_2_local.to(dtype=torch.float16)
985
+
986
+ # requires_grad_(False)
987
+ for m in [vae_local, text_encoder_local, text_encoder_2_local, image_encoder_local, transformer_local]:
988
+ m.requires_grad_(False)
989
+
990
+ # With plenty of VRAM keep everything on the GPU,
991
+ # otherwise fall back to DynamicSwap
992
+ if GPU_AVAILABLE:
993
+ if not high_vram:
994
+ DynamicSwapInstaller.install_model(transformer_local, device=gpu)
995
+ DynamicSwapInstaller.install_model(text_encoder_local, device=gpu)
996
  else:
997
+ text_encoder_local.to(gpu)
998
+ text_encoder_2_local.to(gpu)
999
+ image_encoder_local.to(gpu)
1000
+ vae_local.to(gpu)
1001
+ transformer_local.to(gpu)
1002
  else:
1003
+ cpu_fallback_mode = True
1004
+
1005
+ # Assign to the globals
1006
+ print("Model loaded.")
1007
+ text_encoder = text_encoder_local
1008
+ text_encoder_2 = text_encoder_2_local
1009
+ tokenizer = tokenizer_local
1010
+ tokenizer_2 = tokenizer_2_local
1011
+ vae = vae_local
1012
+ feature_extractor = feature_extractor_local
1013
+ image_encoder = image_encoder_local
1014
+ transformer = transformer_local
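Note: because load_global_models() returns early once transformer is set, the heavy downloads happen on the first generation request rather than at import time, and repeated calls are cheap no-ops:

load_global_models()   # first call: loads text encoders, VAE, SigLIP and the transformer
load_global_models()   # subsequent calls return immediately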
1015
+
1016
+ #############################################
1017
+ # Worker logic (unchanged from the reference implementation)
1018
+ #############################################
1019
+ stream = AsyncStream()
1020
+
1021
+ outputs_folder = './outputs/'
1022
+ os.makedirs(outputs_folder, exist_ok=True)
1023
 
1024
  @torch.no_grad()
1025
  def worker(
1026
+ input_image, prompt, n_prompt, seed,
1027
+ total_second_length, latent_window_size, steps,
1028
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
 
 
 
 
 
 
 
 
 
1029
  ):
1030
  """
1031
+ Core sampling logic (based on the reference implementation).
1032
  """
1033
+ load_global_models() # ๋ชจ๋ธ ๋กœ๋”ฉ
1034
+ global text_encoder, text_encoder_2, tokenizer, tokenizer_2
1035
+ global vae, feature_extractor, image_encoder, transformer
1036
  global last_update_time
 
1037
 
1038
+ # Cap at 4 seconds
1039
  total_second_length = min(total_second_length, 4.0)
1040
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1041
  total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
1042
  total_latent_sections = int(max(round(total_latent_sections), 1))
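Note: to make the arithmetic concrete, at 30 fps with the default latent_window_size of 9 each section contributes 9 * 4 - 3 = 33 frames, so requested lengths round to whole sections (ignoring the extra start frame):

latent_window_size, fps = 9, 30
for seconds in (1, 2, 4):
    sections = int(max(round(seconds * fps / (latent_window_size * 4)), 1))
    frames = sections * (latent_window_size * 4 - 3)
    print(seconds, sections, frames, f"~{frames / fps:.1f}s")
# 1 s -> 1 section, 33 frames (~1.1 s); 2 s -> 2 sections, 66 frames (~2.2 s); 4 s -> 3 sections, 99 frames (~3.3 s)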
1043
 
1044
  job_id = generate_timestamp()
 
 
 
 
1045
 
 
1046
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
1047
 
1048
  try:
1049
+ # GPU ์ ์„ ๊ฒฝ์šฐ Unload
1050
+ if not high_vram and GPU_AVAILABLE:
1051
+ unload_complete_models(
1052
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
1053
+ )
 
1054
 
1055
+ # Text encoding
1056
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
 
1057
 
1058
+ if not high_vram and GPU_AVAILABLE:
1059
+ fake_diffusers_current_device(text_encoder, gpu)
1060
+ load_model_as_complete(text_encoder_2, target_device=gpu)
 
 
1061
 
1062
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1063
+ if cfg == 1.0:
1064
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
1065
+ else:
1066
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
1067
+
1068
+ llama_vec, llama_mask = crop_or_pad_yield_mask(llama_vec, length=512)
1069
+ llama_vec_n, llama_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
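Note on the cfg == 1.0 branch above: classifier-free guidance combines the two predictions as uncond + scale * (cond - uncond), so at scale 1.0 the unconditional term cancels out and zero tensors are a free placeholder that avoids a second text-encoder pass:

def cfg_mix(cond, uncond, scale):
    return uncond + scale * (cond - uncond)   # scale == 1.0 -> exactly cond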
1070
+
1071
+ # Image processing
1072
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
1073
+
1074
+ H, W, C = input_image.shape
1075
+ height, width = find_nearest_bucket(H, W, resolution=640)
1076
+
1077
+ if cpu_fallback_mode:
1078
+ height = min(height, 320)
1079
+ width = min(width, 320)
1080
+
1081
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
1082
+
1083
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
1084
+
1085
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
1086
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
1087
+
1088
+ # VAE encode
1089
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
1090
+
1091
+ if not high_vram and GPU_AVAILABLE:
1092
+ load_model_as_complete(vae, target_device=gpu)
1093
+ start_latent = vae_encode(input_image_pt, vae)
1094
+
1095
+ # CLIP Vision
1096
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
1097
+
1098
+ if not high_vram and GPU_AVAILABLE:
1099
+ load_model_as_complete(image_encoder, target_device=gpu)
1100
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
1101
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
1102
+
1103
+ # dtype
1104
+ llama_vec = llama_vec.to(transformer.dtype)
1105
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
1106
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
1107
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
1108
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
1109
+
1110
+ # Start sampling
1111
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1112
 
1113
  rnd = torch.Generator("cpu").manual_seed(seed)
1114
 
1115
+ # Initial history latents
1116
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
1117
+ history_pixels = None
1118
+
1119
+ # Append the start_latent
1120
+ history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
1121
+ total_generated_latent_frames = 1
 
 
 
 
 
 
 
 
 
1122
 
1123
  for section_index in range(total_latent_sections):
1124
  if stream.input_queue.top() == 'end':
 
 
 
 
 
 
 
 
 
 
1125
  stream.output_queue.push(('end', None))
1126
  return
1127
 
1128
+ print(f'Section {section_index+1}/{total_latent_sections}')
1129
+
1130
+ if not high_vram and GPU_AVAILABLE:
1131
+ unload_complete_models()
1132
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
1133
+
1134
+ # teacache
1135
+ if use_teacache:
1136
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
 
 
 
 
 
 
 
 
 
1137
  else:
1138
  transformer.initialize_teacache(enable_teacache=False)
1139
 
 
1140
  def callback(d):
1141
+ preview = d['denoised']
1142
+ preview = vae_decode_fake(preview)
1143
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
1144
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
1145
+
1146
+ if stream.input_queue.top() == 'end':
1147
+ stream.output_queue.push(('end', None))
1148
+ raise KeyboardInterrupt('User stops generation.')
1149
+
1150
+ current_step = d['i'] + 1
1151
+ percentage = int(100.0 * current_step / steps)
1152
+ hint = f'Sampling {current_step}/{steps}'
1153
+ desc = f'Section {section_index+1}/{total_latent_sections}'
1154
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
 
 
 
 
 
 
 
1155
  return
1156
 
1157
+ # indices
1158
+ frames_per_section = latent_window_size * 4 - 3
1159
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
1160
+ (
1161
+ clean_latent_indices_start,
1162
+ clean_latent_4x_indices,
1163
+ clean_latent_2x_indices,
1164
+ clean_latent_1x_indices,
1165
+ latent_indices
1166
+ ) = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
1167
+
1168
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
1169
+
1170
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -19:, :, :].split([16, 2, 1], dim=2)
1171
+ clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1172
 
 
1173
  try:
1174
  generated_latents = sample_hunyuan(
1175
  transformer=transformer,
 
1183
  num_inference_steps=steps,
1184
  generator=rnd,
1185
  prompt_embeds=llama_vec,
1186
+ prompt_embeds_mask=llama_mask,
1187
  prompt_poolers=clip_l_pooler,
1188
  negative_prompt_embeds=llama_vec_n,
1189
+ negative_prompt_embeds_mask=llama_mask_n,
1190
  negative_prompt_poolers=clip_l_pooler_n,
1191
+ device=gpu if GPU_AVAILABLE else cpu,
1192
+ dtype=torch.bfloat16,
1193
  image_embeddings=image_encoder_last_hidden_state,
1194
  latent_indices=latent_indices,
1195
  clean_latents=clean_latents,
1196
+ clean_latent_indices=clean_latent_indices,
1197
  clean_latents_2x=clean_latents_2x,
1198
  clean_latent_2x_indices=clean_latent_2x_indices,
1199
  clean_latents_4x=clean_latents_4x,
 
1201
  callback=callback
1202
  )
1203
  except KeyboardInterrupt:
1204
+ print("User cancelled.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1205
  stream.output_queue.push(('end', None))
1206
  return
 
 
 
 
 
1207
  except Exception as e:
 
 
1208
  traceback.print_exc()
 
1209
  stream.output_queue.push(('end', None))
1210
  return
1211
 
1212
+ total_generated_latent_frames += generated_latents.shape[2]
1213
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
 
 
 
 
 
1214
 
1215
+ if not high_vram and GPU_AVAILABLE:
1216
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
1217
+ load_model_as_complete(vae, target_device=gpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1218
 
1219
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
 
 
 
1220
 
1221
+ if history_pixels is None:
1222
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
1223
+ else:
1224
+ section_latent_frames = latent_window_size * 2
1225
+ overlapped_frames = frames_per_section
1226
+ current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
1227
+ history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
1228
 
1229
+ if not high_vram and GPU_AVAILABLE:
1230
+ unload_complete_models()
1231
 
1232
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
1233
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=16) # CRF=16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
 
1235
+ stream.output_queue.push(('file', output_filename))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
 
1237
+ except:
1238
+ traceback.print_exc()
1239
+ if not high_vram and GPU_AVAILABLE:
1240
+ unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1241
 
1242
+ stream.output_queue.push(('end', None))
1243
+ return
1244
 
1245
  def end_process():
1246
  """
1247
+ Request the current generation to stop.
1248
  """
 
1249
  global stream
1250
+ stream.input_queue.push('end')
 
 
 
 
 
 
 
 
 
 
 
 
 
1251
 
1252
+ # Gradio์—์„œ ์ด worker ํ•จ์ˆ˜๋ฅผ ๋น„๋™๊ธฐ๋กœ ํ˜ธ์ถœ
1253
+ def process(
1254
+ input_image, prompt, n_prompt, seed,
1255
+ total_second_length, latent_window_size, steps,
1256
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
1257
+ ):
1258
+ global stream
1259
+ if input_image is None:
1260
+ raise ValueError("No input image provided.")
1261
+
1262
+ yield None, None, "", "", gr.update(interactive=False), gr.update(interactive=True)
1263
+
1264
+ stream = AsyncStream()
1265
+ async_run(
1266
+ worker,
1267
+ input_image, prompt, n_prompt, seed,
1268
+ total_second_length, latent_window_size, steps,
1269
+ cfg, gs, rs, gpu_memory_preservation, use_teacache
1270
+ )
1271
+
1272
+ output_filename = None
1273
+ prev_filename = None
1274
+ error_message = None
1275
+
1276
+ while True:
1277
+ flag, data = stream.output_queue.next()
1278
+ if flag == 'file':
1279
+ output_filename = data
1280
+ prev_filename = output_filename
1281
+ yield output_filename, gr.update(), gr.update(), "", gr.update(interactive=False), gr.update(interactive=True)
1282
+
1283
+ elif flag == 'progress':
1284
+ preview, desc, html = data
1285
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
1286
+
1287
+ elif flag == 'error':
1288
+ error_message = data
1289
+ print(f"Error: {error_message}")
1290
+
1291
+ elif flag == 'end':
1292
+ if output_filename is None and prev_filename:
1293
+ output_filename = prev_filename
1294
+ # Show the error if one occurred
1295
+ if error_message:
1296
+ yield (
1297
+ output_filename, # last file (or None)
1298
+ gr.update(visible=False),
1299
+ gr.update(),
1300
+ f"<div style='color:red;'>{error_message}</div>",
1301
+ gr.update(interactive=True),
1302
+ gr.update(interactive=False)
1303
+ )
1304
+ else:
1305
+ yield (
1306
+ output_filename, gr.update(visible=False), gr.update(), "", gr.update(interactive=True), gr.update(interactive=False)
1307
+ )
1308
+ break
1309
 
1310
+ # UI CSS
1311
  def make_custom_css():
1312
  base_progress_css = make_progress_bar_css()
1313
  pastel_css = """
 
1314
  body {
1315
  background: #faf9ff !important;
1316
  font-family: "Noto Sans", sans-serif;
 
1371
  margin-top: 10px;
1372
  font-weight: 500;
1373
  }
 
 
 
 
 
 
 
 
 
 
 
1374
  @media (max-width: 768px) {
1375
  #app-container {
1376
  padding: 0.5rem;
 
1387
 
1388
  css = make_custom_css()
1389
 
1390
+ # Sample prompts
1391
+ quick_prompts = [
1392
+ ["The girl dances gracefully, with clear movements, full of charm."],
1393
+ ["A character doing some simple body movements."]
1394
+ ]
1395
+
1396
  # Gradio UI
1397
  block = gr.Blocks(css=css).queue()
1398
  with block:
 
1399
  gr.HTML("<div id='app-container'><h1>FramePack - Image to Video Generation</h1></div>")
1400
 
1401
  with gr.Row(elem_classes="mobile-full-width"):
1402
+ # Left column
1403
  with gr.Column(scale=1, elem_classes="gr-panel"):
1404
  input_image = gr.Image(
1405
  label=get_translation("upload_image"),
 
1406
  type="numpy",
 
1407
  height=320
1408
  )
1409
+ prompt = gr.Textbox(
1410
+ label=get_translation("prompt"),
1411
+ value=''
1412
+ )
1413
 
1414
  example_quick_prompts = gr.Dataset(
1415
  samples=quick_prompts,
 
1424
  show_progress=False,
1425
  queue=False
1426
  )
1427
+
1428
+ # Right column
1429
  with gr.Column(scale=1, elem_classes="gr-panel"):
1430
  with gr.Row(elem_classes="button-container"):
1431
  start_button = gr.Button(
 
1433
  elem_id="start-button",
1434
  variant="primary"
1435
  )
1436
+ stop_button = gr.Button(
1437
  value=get_translation("stop_generation"),
1438
  elem_id="stop-button",
1439
  interactive=False
1440
  )
1441
+
1442
  result_video = gr.Video(
1443
  label=get_translation("generated_video"),
1444
  autoplay=True,
1445
  loop=True,
1446
  height=320,
1447
+ elem_classes="video-container"
 
1448
  )
1449
  preview_image = gr.Image(
1450
  label=get_translation("next_latents"),
 
1452
  height=150,
1453
  elem_classes="preview-container"
1454
  )
 
1455
  gr.Markdown(get_translation("sampling_note"))
1456
+
1457
  with gr.Group(elem_classes="progress-container"):
1458
  progress_desc = gr.Markdown('')
1459
  progress_bar = gr.HTML('')
 
 
1460
 
1461
+ error_message = gr.HTML('', visible=True)
1462
+
1463
+ # Advanced
1464
  with gr.Accordion("Advanced Settings", open=False, elem_classes="gr-panel"):
1465
  use_teacache = gr.Checkbox(
1466
  label=get_translation("use_teacache"),
 
1473
  value=31337,
1474
  precision=0
1475
  )
1476
+ # Default 2 seconds, maximum 4 seconds
1477
  total_second_length = gr.Slider(
1478
  label=get_translation("video_length"),
1479
  minimum=1,
 
1530
  info=get_translation("gpu_memory_info")
1531
  )
1532
 
1533
+ # Button wiring
1534
+ inputs_list = [
1535
  input_image, prompt, n_prompt, seed,
1536
  total_second_length, latent_window_size, steps,
1537
  cfg, gs, rs, gpu_memory_preservation, use_teacache
1538
  ]
1539
  start_button.click(
1540
  fn=process,
1541
+ inputs=inputs_list,
1542
+ outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, stop_button]
1543
  )
1544
+ stop_button.click(fn=end_process)
1545
 
1546
  block.launch()