Update app.py
app.py CHANGED

@@ -56,8 +56,14 @@ text_encoder_2.requires_grad_(False)
 image_encoder.requires_grad_(False)
 transformer.requires_grad_(False)
 
-DynamicSwapInstaller.install_model(transformer, device=gpu)
-DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+# DynamicSwapInstaller.install_model(transformer, device=gpu)
+# DynamicSwapInstaller.install_model(text_encoder, device=gpu)
+
+text_encoder.to(gpu)
+text_encoder_2.to(gpu)
+image_encoder.to(gpu)
+vae.to(gpu)
+transformer.to(gpu)
 
 stream = AsyncStream()
 
@@ -75,16 +81,16 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
     try:
-        unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
-        )
+        # unload_complete_models(
+        #     text_encoder, text_encoder_2, image_encoder, vae, transformer
+        # )
 
         # Text encoding
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
 
-        fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
-        load_model_as_complete(text_encoder_2, target_device=gpu)
+        # fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
+        # load_model_as_complete(text_encoder_2, target_device=gpu)
 
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
 
@@ -113,7 +119,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
 
-        load_model_as_complete(vae, target_device=gpu)
+        # load_model_as_complete(vae, target_device=gpu)
 
         start_latent = vae_encode(input_image_pt, vae)
 
@@ -121,7 +127,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
 
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
-        load_model_as_complete(image_encoder, target_device=gpu)
+        # load_model_as_complete(image_encoder, target_device=gpu)
 
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
@@ -172,8 +178,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
             clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
 
-            unload_complete_models()
-            move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+            # unload_complete_models()
+            # move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
 
             if use_teacache:
                 transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
@@ -235,8 +241,8 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
             total_generated_latent_frames += int(generated_latents.shape[2])
             history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
 
-            offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-            load_model_as_complete(vae, target_device=gpu)
+            # offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+            # load_model_as_complete(vae, target_device=gpu)
 
             real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
 
@@ -249,7 +255,7 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
                 current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
                 history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
 
-            unload_complete_models()
+            # unload_complete_models()
 
             output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
 
@@ -264,9 +270,9 @@ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_wind
     except:
         traceback.print_exc()
 
-        unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
-        )
+        # unload_complete_models(
+        #     text_encoder, text_encoder_2, image_encoder, vae, transformer
+        # )
 
     stream.output_queue.push(('end', None))
     return
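Note: the change above disables the dynamic swap / offload path (DynamicSwapInstaller, load_model_as_complete, unload_complete_models, move_model_to_device_with_memory_preservation) and instead keeps all five pipeline components resident on the GPU for the whole worker run. A minimal sketch of that pattern, assuming the components are ordinary torch.nn.Module instances; the helper name keep_models_on_gpu is illustrative and not part of app.py:

import torch
from torch import nn

def keep_models_on_gpu(models: list[nn.Module], gpu: torch.device = torch.device('cuda')) -> None:
    # High-VRAM pattern used by this commit: move every component to the GPU once
    # at startup instead of loading/unloading each model around its pipeline stage.
    for m in models:
        m.eval()
        m.requires_grad_(False)  # inference only, matching requires_grad_(False) in app.py
        m.to(gpu)

# Hypothetical usage with the components named in app.py:
# keep_models_on_gpu([text_encoder, text_encoder_2, image_encoder, vae, transformer])

The trade-off is VRAM: all five models must fit on the GPU at the same time. The offloading calls are left commented out rather than deleted, so they can be restored for lower-memory setups.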