ginipick committed
Commit 2b46b9d · verified · 1 Parent(s): 7370131

Update demo_gradio.py

Files changed (1)
  1. demo_gradio.py +256 -233
demo_gradio.py CHANGED
@@ -1,42 +1,56 @@
- from diffusers_helper.hf_login import login

  import os

- os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))

- import gradio as gr
  import torch
- import traceback
- import einops
- import safetensors.torch as sf
  import numpy as np
- import argparse
- import math

  from PIL import Image
  from diffusers import AutoencoderKLHunyuanVideo
- from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
- from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
- from diffusers_helper.thread_utils import AsyncStream, async_run
- from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
- from transformers import SiglipImageProcessor, SiglipVisionModel
  from diffusers_helper.clip_vision import hf_clip_vision_encode
- from diffusers_helper.bucket_tools import find_nearest_bucket


  parser = argparse.ArgumentParser()
  parser.add_argument('--share', action='store_true')
- parser.add_argument("--server", type=str, default='0.0.0.0')
- parser.add_argument("--port", type=int, required=False)
- parser.add_argument("--inbrowser", action='store_true')
  args = parser.parse_args()

- # for win desktop probably use --server 127.0.0.1 --inbrowser
- # For linux server probably use --server 127.0.0.1 or do not use any cmd flags

  print(args)

@@ -46,6 +60,7 @@ high_vram = free_mem_gb > 60
  print(f'Free VRAM {free_mem_gb} GB')
  print(f'High-VRAM Mode: {high_vram}')

  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
  tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
@@ -57,18 +72,18 @@ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", s

  transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()

- vae.eval()
- text_encoder.eval()
- text_encoder_2.eval()
- image_encoder.eval()
- transformer.eval()

  if not high_vram:
      vae.enable_slicing()
      vae.enable_tiling()

  transformer.high_quality_fp32_output_for_inference = True
- print('transformer.high_quality_fp32_output_for_inference = True')

  transformer.to(dtype=torch.bfloat16)
  vae.to(dtype=torch.float16)
@@ -76,21 +91,17 @@ image_encoder.to(dtype=torch.float16)
  text_encoder.to(dtype=torch.float16)
  text_encoder_2.to(dtype=torch.float16)

- vae.requires_grad_(False)
- text_encoder.requires_grad_(False)
- text_encoder_2.requires_grad_(False)
- image_encoder.requires_grad_(False)
- transformer.requires_grad_(False)

  if not high_vram:
-     # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
      DynamicSwapInstaller.install_model(transformer, device=gpu)
      DynamicSwapInstaller.install_model(text_encoder, device=gpu)
  else:
-     text_encoder.to(gpu)
-     text_encoder_2.to(gpu)
-     image_encoder.to(gpu)
-     vae.to(gpu)
      transformer.to(gpu)

  stream = AsyncStream()
@@ -98,221 +109,243 @@ stream = AsyncStream()
  outputs_folder = './outputs/'
  os.makedirs(outputs_folder, exist_ok=True)

- @torch.no_grad()
- def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
-     total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
-     total_latent_sections = int(max(round(total_latent_sections), 1))

-     job_id = generate_timestamp()

-     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

-     try:
-         # Clean GPU
-         if not high_vram:
-             unload_complete_models(
-                 text_encoder, text_encoder_2, image_encoder, vae, transformer
-             )

-         # Text encoding

-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))

-         if not high_vram:
-             fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-             load_model_as_complete(text_encoder_2, target_device=gpu)

-         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

-         if cfg == 1:
-             llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
-         else:
-             llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

-         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
-         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

-         # Processing input image

-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))

-         H, W, C = input_image.shape
-         height, width = find_nearest_bucket(H, W, resolution=640)
-         input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)

-         Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

-         input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
-         input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]

-         # VAE encoding

-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

-         if not high_vram:
-             load_model_as_complete(vae, target_device=gpu)

-         start_latent = vae_encode(input_image_pt, vae)

-         # CLIP Vision

-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

-         if not high_vram:
-             load_model_as_complete(image_encoder, target_device=gpu)

-         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
-         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

-         # Dtype

-         llama_vec = llama_vec.to(transformer.dtype)
-         llama_vec_n = llama_vec_n.to(transformer.dtype)
-         clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-         clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
-         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)

-         # Sampling

-         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))

-         rnd = torch.Generator("cpu").manual_seed(seed)
-         num_frames = latent_window_size * 4 - 3

-         history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
-         history_pixels = None
-         total_generated_latent_frames = 0

-         latent_paddings = reversed(range(total_latent_sections))

-         if total_latent_sections > 4:
-             # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
-             # items looks better than expanding it when total_latent_sections > 4
-             # One can try to remove below trick and just
-             # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
-             latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]

-         for latent_padding in latent_paddings:
-             is_last_section = latent_padding == 0
-             latent_padding_size = latent_padding * latent_window_size

-             if stream.input_queue.top() == 'end':
-                 stream.output_queue.push(('end', None))
-                 return

-             print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}')

-             indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
-             clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
-             clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)

-             clean_latents_pre = start_latent.to(history_latents)
-             clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
-             clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)

-             if not high_vram:
-                 unload_complete_models()
-                 move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

-             if use_teacache:
-                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
-             else:
-                 transformer.initialize_teacache(enable_teacache=False)

-             def callback(d):
-                 preview = d['denoised']
-                 preview = vae_decode_fake(preview)

-                 preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                 preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')

-                 if stream.input_queue.top() == 'end':
-                     stream.output_queue.push(('end', None))
-                     raise KeyboardInterrupt('User ends the task.')

-                 current_step = d['i'] + 1
-                 percentage = int(100.0 * current_step / steps)
-                 hint = f'Sampling {current_step}/{steps}'
-                 desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
-                 stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                 return

-             generated_latents = sample_hunyuan(
-                 transformer=transformer,
-                 sampler='unipc',
-                 width=width,
-                 height=height,
-                 frames=num_frames,
-                 real_guidance_scale=cfg,
-                 distilled_guidance_scale=gs,
-                 guidance_rescale=rs,
-                 # shift=3.0,
-                 num_inference_steps=steps,
-                 generator=rnd,
-                 prompt_embeds=llama_vec,
-                 prompt_embeds_mask=llama_attention_mask,
-                 prompt_poolers=clip_l_pooler,
-                 negative_prompt_embeds=llama_vec_n,
-                 negative_prompt_embeds_mask=llama_attention_mask_n,
-                 negative_prompt_poolers=clip_l_pooler_n,
-                 device=gpu,
-                 dtype=torch.bfloat16,
-                 image_embeddings=image_encoder_last_hidden_state,
-                 latent_indices=latent_indices,
-                 clean_latents=clean_latents,
-                 clean_latent_indices=clean_latent_indices,
-                 clean_latents_2x=clean_latents_2x,
-                 clean_latent_2x_indices=clean_latent_2x_indices,
-                 clean_latents_4x=clean_latents_4x,
-                 clean_latent_4x_indices=clean_latent_4x_indices,
-                 callback=callback,
-             )

-             if is_last_section:
-                 generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)

-             total_generated_latent_frames += int(generated_latents.shape[2])
-             history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)

-             if not high_vram:
-                 offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                 load_model_as_complete(vae, target_device=gpu)

-             real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]

-             if history_pixels is None:
-                 history_pixels = vae_decode(real_history_latents, vae).cpu()
-             else:
-                 section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
-                 overlapped_frames = latent_window_size * 4 - 3

-                 current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
-                 history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)

-             if not high_vram:
-                 unload_complete_models()

-             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

-             save_bcthw_as_mp4(history_pixels, output_filename, fps=30)

-             print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')

-             stream.output_queue.push(('file', output_filename))

-             if is_last_section:
-                 break
-     except:
-         traceback.print_exc()

-         if not high_vram:
-             unload_complete_models(
-                 text_encoder, text_encoder_2, image_encoder, vae, transformer
-             )

-     stream.output_queue.push(('end', None))
-     return

  def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
@@ -343,8 +376,8 @@ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_win
              break


- def end_process():
-     stream.input_queue.push('end')


  quick_prompts = [
@@ -355,42 +388,35 @@ quick_prompts = [[x] for x in quick_prompts]


  css = make_progress_bar_css()
  block = gr.Blocks(css=css).queue()
  with block:
      gr.Markdown('# FramePack')
-     with gr.Row():
-         with gr.Column():
-             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
-             prompt = gr.Textbox(label="Prompt", value='')
-             example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Quick List', samples_per_page=1000, components=[prompt])
-             example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
-
-             with gr.Row():
-                 start_button = gr.Button(value="Start Generation")
                  end_button = gr.Button(value="End Generation", interactive=False)

              with gr.Group():
-                 use_teacache = gr.Checkbox(label='Use TeaCache', value=True, info='Faster speed, but often makes hands and fingers slightly worse.')

-                 n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)  # Not used
                  seed = gr.Number(label="Seed", value=31337, precision=0)

                  total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
-                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False)  # Should not change
-                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Changing this value is not recommended.')

-                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False)  # Should not change
-                 gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Changing this value is not recommended.')
-                 rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)  # Should not change

-                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")

          with gr.Column():
              preview_image = gr.Image(label="Next Latents", height=200, visible=False)
              result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
-             gr.Markdown('Note that the ending actions will be generated before the starting actions due to the inverted sampling. If the starting action is not in the video, you just need to wait, and it will be generated later.')
              progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
              progress_bar = gr.HTML('', elem_classes='no-generating-animation')

      ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
      start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
      end_button.click(fn=end_process)
@@ -398,7 +424,4 @@ with block:

  block.launch(
      server_name=args.server,
-     server_port=args.port,
-     share=args.share,
-     inbrowser=args.inbrowser,
- )


  import os
+ import argparse
+


  import torch
+ import gradio as gr
+
+
  import numpy as np
+ import einops
+ import traceback

  from PIL import Image
  from diffusers import AutoencoderKLHunyuanVideo
+ from transformers import (
+     LlamaModel, CLIPTextModel,
+     LlamaTokenizerFast, CLIPTokenizer,
+     SiglipImageProcessor, SiglipVisionModel
+ )
+
+ from diffusers_helper.hf_login import login
+ from diffusers_helper.hunyuan import (
+     encode_prompt_conds, vae_decode, vae_encode,
+     vae_decode_fake
+ )
+ from diffusers_helper.utils import (
+     save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw,
+     resize_and_center_crop, generate_timestamp
+ )
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
+ from diffusers_helper.memory import (
+     gpu, get_cuda_free_memory_gb, unload_complete_models, load_model_as_complete,
+     DynamicSwapInstaller, move_model_to_device_with_memory_preservation,
+     offload_model_from_device_for_memory_preservation, fake_diffusers_current_device
+ )
  from diffusers_helper.clip_vision import hf_clip_vision_encode
+ from diffusers_helper.thread_utils import AsyncStream, async_run


+ # --- Args and config ---
  parser = argparse.ArgumentParser()
  parser.add_argument('--share', action='store_true')
+ parser.add_argument('--server', type=str, default='0.0.0.0')
+ parser.add_argument('--port', type=int, required=False)
+ parser.add_argument('--inbrowser', action='store_true')
  args = parser.parse_args()

+ os.environ['HF_HOME'] = os.path.abspath(
+     os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download'))
+ )

  print(args)

  print(f'Free VRAM {free_mem_gb} GB')
  print(f'High-VRAM Mode: {high_vram}')

+ # --- Load models ---
  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
  tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')

  transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()

+ vae.eval(), text_encoder.eval(), text_encoder_2.eval(), image_encoder.eval(), transformer.eval()
+
+
+
+

  if not high_vram:
      vae.enable_slicing()
      vae.enable_tiling()

  transformer.high_quality_fp32_output_for_inference = True
+

  transformer.to(dtype=torch.bfloat16)
  vae.to(dtype=torch.float16)

  text_encoder.to(dtype=torch.float16)
  text_encoder_2.to(dtype=torch.float16)

+ for model in [vae, text_encoder, text_encoder_2, image_encoder, transformer]:
+     model.requires_grad_(False)
+
+
+

  if not high_vram:
+
      DynamicSwapInstaller.install_model(transformer, device=gpu)
      DynamicSwapInstaller.install_model(text_encoder, device=gpu)
  else:
      transformer.to(gpu)

  stream = AsyncStream()

  outputs_folder = './outputs/'
  os.makedirs(outputs_folder, exist_ok=True)

+ # --- UI + CSS ---
+ def make_progress_bar_css():
+     return """
+     body, .gradio-container {
+         background-color: #000000 !important;
+         color: #FFFFFF !important;
+     }
+     .gr-button, .gr-input, .gr-textbox, .gr-slider, .gr-checkbox {
+         background-color: #1a1a1a !important;
+         color: #ffffff !important;
+         border-color: #444 !important;
+     }
+     .gr-button:hover {
+         background-color: #333 !important;
+     }
+     .gr-markdown {
+         color: #ddd !important;
+     }
+     .gr-image-preview, .gr-video {
+         background-color: #111 !important;
+     }
+     """
+
+ def end_process():
+     stream.input_queue.push('end')
[new file lines 137-349: added code not rendered in this diff view]

  def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
 
              break


+
+


  quick_prompts = [


  css = make_progress_bar_css()
+
  block = gr.Blocks(css=css).queue()
  with block:
      gr.Markdown('# FramePack')
                  end_button = gr.Button(value="End Generation", interactive=False)

              with gr.Group():
+                 use_teacache = gr.Checkbox(label='Use TeaCache', value=True)
+                 n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)

                  seed = gr.Number(label="Seed", value=31337, precision=0)

                  total_second_length = gr.Slider(label="Total Video Length (Seconds)", minimum=1, maximum=120, value=5, step=0.1)
+                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, visible=False)
+                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1)
+                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False)
+                 gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01)
+                 rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False)
+                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB)", minimum=6, maximum=128, value=6, step=0.1)


          with gr.Column():
              preview_image = gr.Image(label="Next Latents", height=200, visible=False)
              result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
+             gr.Markdown('Note: The ending actions are generated before the start. Wait for full video.')
              progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
              progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+
      ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
      start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
      end_button.click(fn=end_process)


  block.launch(
      server_name=args.server,
+     server_port=args.port,
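
For orientation, below is a minimal, self-contained sketch of how the pieces this commit touches fit together: the CLI flags parsed at the top of demo_gradio.py, the new dark-theme make_progress_bar_css() helper, and the gr.Blocks(...).queue() / block.launch(...) wiring. It is an illustrative reduction, not the actual script: the UI body is a placeholder, and only server_name/server_port are confirmed launch keywords in this diff, so the handling of --share/--inbrowser here is an assumption.

import argparse
import gradio as gr


def make_progress_bar_css():
    # Same idea as the helper added in this commit: force a dark page background
    # and dark widget styling by injecting CSS into the Blocks app.
    return """
    body, .gradio-container { background-color: #000000 !important; color: #FFFFFF !important; }
    .gr-button, .gr-input, .gr-textbox, .gr-slider, .gr-checkbox {
        background-color: #1a1a1a !important; color: #ffffff !important; border-color: #444 !important;
    }
    """


parser = argparse.ArgumentParser()
parser.add_argument('--share', action='store_true')
parser.add_argument('--server', type=str, default='0.0.0.0')
parser.add_argument('--port', type=int, required=False)
parser.add_argument('--inbrowser', action='store_true')
args = parser.parse_args()

with gr.Blocks(css=make_progress_bar_css()) as block:
    gr.Markdown('# FramePack')  # placeholder; the real demo builds the image/prompt/video UI here

block.queue()
block.launch(
    server_name=args.server,
    server_port=args.port,      # None lets Gradio pick its default port
    share=args.share,           # --share / --inbrowser are parsed by the script; how the commit
    inbrowser=args.inbrowser,   # wires them into launch() is not shown in this truncated diff
)

Run it as a normal Python script, e.g. python demo_gradio.py --server 127.0.0.1 --inbrowser on a desktop, mirroring the comments removed from the old version of the file.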