Update app.py
app.py CHANGED
@@ -14,7 +14,7 @@ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode
 from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, generate_timestamp
 from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
 from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
-from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
+from diffusers_helper.memory import cpu, gpu, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
 from diffusers_helper.thread_utils import AsyncStream, async_run
 from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
 from transformers import SiglipImageProcessor, SiglipVisionModel
@@ -22,12 +22,6 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode
 from diffusers_helper.bucket_tools import find_nearest_bucket
 
 
-free_mem_gb = get_cuda_free_memory_gb(gpu)
-high_vram = free_mem_gb > 60
-
-print(f'Free VRAM {free_mem_gb} GB')
-print(f'High-VRAM Mode: {high_vram}')
-
 text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
 text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
 tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
@@ -45,9 +39,8 @@ text_encoder_2.eval()
 image_encoder.eval()
 transformer.eval()
 
-if not high_vram:
-    vae.enable_slicing()
-    vae.enable_tiling()
+vae.enable_slicing()
+vae.enable_tiling()
 
 transformer.high_quality_fp32_output_for_inference = True
 print('transformer.high_quality_fp32_output_for_inference = True')
@@ -64,16 +57,8 @@ text_encoder_2.requires_grad_(False)
 image_encoder.requires_grad_(False)
 transformer.requires_grad_(False)
 
-if not high_vram:
-    # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
-    DynamicSwapInstaller.install_model(transformer, device=gpu)
-    DynamicSwapInstaller.install_model(text_encoder, device=gpu)
-else:
-    text_encoder.to(gpu)
-    text_encoder_2.to(gpu)
-    image_encoder.to(gpu)
-    vae.to(gpu)
-    transformer.to(gpu)
+DynamicSwapInstaller.install_model(transformer, device=gpu)
+DynamicSwapInstaller.install_model(text_encoder, device=gpu)
 
 stream = AsyncStream()
 
@@ -91,19 +76,16 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
     try:
-        # Clean GPU
-        if not high_vram:
-            unload_complete_models(
-                text_encoder, text_encoder_2, image_encoder, vae, transformer
-            )
+        unload_complete_models(
+            text_encoder, text_encoder_2, image_encoder, vae, transformer
+        )
 
         # Text encoding
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
 
-        if not high_vram:
-            fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-            load_model_as_complete(text_encoder_2, target_device=gpu)
+        fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
+        load_model_as_complete(text_encoder_2, target_device=gpu)
 
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
@@ -132,8 +114,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
 
-        if not high_vram:
-            load_model_as_complete(vae, target_device=gpu)
+        load_model_as_complete(vae, target_device=gpu)
 
         start_latent = vae_encode(input_image_pt, vae)
 
@@ -141,8 +122,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
-        if not high_vram:
-            load_model_as_complete(image_encoder, target_device=gpu)
+        load_model_as_complete(image_encoder, target_device=gpu)
 
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
@@ -193,9 +173,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
             clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
 
-            if not high_vram:
-                unload_complete_models()
-                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+            unload_complete_models()
+            move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
 
             if use_teacache:
                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
@@ -257,9 +236,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             total_generated_latent_frames += int(generated_latents.shape[2])
             history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
 
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
+            offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+            load_model_as_complete(vae, target_device=gpu)
 
             real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
 
@@ -272,8 +250,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
                 current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
                 history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
 
-            if not high_vram:
-                unload_complete_models()
+            unload_complete_models()
 
             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
 
@@ -288,10 +265,9 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     except:
         traceback.print_exc()
 
-        if not high_vram:
-            unload_complete_models(
-                text_encoder, text_encoder_2, image_encoder, vae, transformer
-            )
+        unload_complete_models(
+            text_encoder, text_encoder_2, image_encoder, vae, transformer
+        )
 
     stream.output_queue.push(('end', None))
     return
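
Net effect of the diff: the automatic high-VRAM detection (free_mem_gb > 60, printed at startup) is removed and the low-VRAM path becomes unconditional. VAE slicing/tiling is always enabled, DynamicSwapInstaller is always installed for the transformer and text encoder, and every stage of worker() loads only the model it currently needs onto the GPU and releases it again before the next stage. The snippet below is a minimal, self-contained sketch of that per-stage offload pattern in plain PyTorch; it deliberately does not use the app's diffusers_helper.memory helpers, and the toy models and the run_stage wrapper are hypothetical stand-ins, not part of the Space.

# Sketch of the "load one model, run its stage, give the VRAM back" pattern
# that the commit makes unconditional. Plain PyTorch stand-ins only.
import torch

gpu = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
cpu = torch.device('cpu')

# Hypothetical stand-ins for the real text encoder and VAE.
text_encoder = torch.nn.Linear(16, 32).eval()
vae = torch.nn.Linear(32, 8).eval()

@torch.no_grad()
def run_stage(model, x):
    """Move `model` to the GPU, run it, then move it back so the next stage has room."""
    model.to(gpu)
    try:
        return model(x.to(gpu)).cpu()
    finally:
        model.to(cpu)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # release cached VRAM before the next stage

features = run_stage(text_encoder, torch.randn(1, 16))  # "Text encoding ..." stage
decoded = run_stage(vae, features)                      # "VAE ..." stage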