Spaces:

toninio19
/

keysync-demo

Running on Zero

App Files Community

Antoni Bigata committed 21 days ago

Commit

7746897

1 Parent(s): 2fb3e22

addapt for zerogpu

Browse files

Files changed (1) hide show

app.py +46 -31

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ from inference_functions import (
 )
 from wordle_game import WordleGame
 import torch.cuda.amp as amp  # Import amp for mixed precision
 # Set default tensor type to float16 for faster computation
@@ -96,10 +97,26 @@ def load_model(
     return model
-# keyframe_model = KeyframeModel(device=device)
-# interpolation_model = InterpolationModel(device=device)
-vae_model = VaeWrapper("video")
-if torch.cuda.is_available():
     vae_model = vae_model.half()  # Convert to half precision
     try:
         vae_model = torch.compile(vae_model)
@@ -107,8 +124,7 @@ if torch.cuda.is_available():
     except Exception as e:
         print(f"Warning: Failed to compile vae_model: {e}")
-hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").cuda()
-if torch.cuda.is_available():
     hubert_model = hubert_model.half()  # Convert to half precision
     try:
         hubert_model = torch.compile(hubert_model)
@@ -116,13 +132,13 @@ if torch.cuda.is_available():
     except Exception as e:
         print(f"Warning: Failed to compile hubert_model: {e}")
-wavlm_model = WavLM_wrapper(
-    model_size="Base+",
-    feed_as_frames=False,
-    merge_type="None",
-    model_path="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/WavLM-Base+.pt",
-).cuda()
-if torch.cuda.is_available():
     wavlm_model = wavlm_model.half()  # Convert to half precision
     try:
         wavlm_model = torch.compile(wavlm_model)
@@ -130,27 +146,23 @@ if torch.cuda.is_available():
     except Exception as e:
         print(f"Warning: Failed to compile wavlm_model: {e}")
-landmarks_extractor = LandmarksExtractor()
-# keyframe_model = load_model(
-#     config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/keyframe.yaml",
-#     ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/keyframe_dub.pt",
-# )
-# interpolation_model = load_model(
-#     config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/interpolation.yaml",
-#     ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/interpolation_dub.pt",
-# )
-# keyframe_model.en_and_decode_n_samples_a_time = 2
-# interpolation_model.en_and_decode_n_samples_a_time = 2
-# Default media paths
-DEFAULT_VIDEO_PATH = os.path.join(
-    os.path.dirname(__file__), "assets", "sample_video.mp4"
-)
-DEFAULT_AUDIO_PATH = os.path.join(
-    os.path.dirname(__file__), "assets", "sample_audio.wav"
-)
 @torch.no_grad()
 def compute_video_embedding(video_reader, min_len):
     """Compute embeddings from video"""
@@ -200,6 +212,7 @@ def compute_video_embedding(video_reader, min_len):
     return encoded, video_frames
 @torch.no_grad()
 def compute_hubert_embedding(raw_audio):
     """Compute embeddings from audio"""
@@ -246,6 +259,7 @@ def compute_hubert_embedding(raw_audio):
     return audio_embeddings
 @torch.no_grad()
 def compute_wavlm_embedding(raw_audio):
     """Compute embeddings from audio"""
@@ -352,6 +366,7 @@ def extract_video_landmarks(video_frames):
     return np.array(processed_landmarks)
 @torch.no_grad()
 def sample(
     audio_list,

 )
 from wordle_game import WordleGame
 import torch.cuda.amp as amp  # Import amp for mixed precision
+import spaces
 # Set default tensor type to float16 for faster computation
     return model
+# Default media paths
+DEFAULT_VIDEO_PATH = os.path.join(
+    os.path.dirname(__file__), "assets", "sample_video.mp4"
+)
+DEFAULT_AUDIO_PATH = os.path.join(
+    os.path.dirname(__file__), "assets", "sample_audio.wav"
+)
+@spaces.GPU(duration=60)
+def load_all_models():
+    global \
+        keyframe_model, \
+        interpolation_model, \
+        vae_model, \
+        hubert_model, \
+        wavlm_model, \
+        landmarks_extractor
+    vae_model = VaeWrapper("video")
     vae_model = vae_model.half()  # Convert to half precision
     try:
         vae_model = torch.compile(vae_model)
     except Exception as e:
         print(f"Warning: Failed to compile vae_model: {e}")
+    hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").cuda()
     hubert_model = hubert_model.half()  # Convert to half precision
     try:
         hubert_model = torch.compile(hubert_model)
     except Exception as e:
         print(f"Warning: Failed to compile hubert_model: {e}")
+    wavlm_model = WavLM_wrapper(
+        model_size="Base+",
+        feed_as_frames=False,
+        merge_type="None",
+        model_path="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/WavLM-Base+.pt",
+    ).cuda()
     wavlm_model = wavlm_model.half()  # Convert to half precision
     try:
         wavlm_model = torch.compile(wavlm_model)
     except Exception as e:
         print(f"Warning: Failed to compile wavlm_model: {e}")
+    landmarks_extractor = LandmarksExtractor()
+    keyframe_model = load_model(
+        config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/keyframe.yaml",
+        ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/keyframe_dub.pt",
+    )
+    interpolation_model = load_model(
+        config="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/scripts/sampling/configs/interpolation.yaml",
+        ckpt="/vol/paramonos2/projects/antoni/code/Personal/code_prep/keysync/pretrained_models/checkpoints/interpolation_dub.pt",
+    )
+    keyframe_model.en_and_decode_n_samples_a_time = 2
+    interpolation_model.en_and_decode_n_samples_a_time = 2
+load_all_models()
+@spaces.GPU(duration=60)
 @torch.no_grad()
 def compute_video_embedding(video_reader, min_len):
     """Compute embeddings from video"""
     return encoded, video_frames
+@spaces.GPU(duration=120)
 @torch.no_grad()
 def compute_hubert_embedding(raw_audio):
     """Compute embeddings from audio"""
     return audio_embeddings
+@spaces.GPU(duration=120)
 @torch.no_grad()
 def compute_wavlm_embedding(raw_audio):
     """Compute embeddings from audio"""
     return np.array(processed_landmarks)
+@spaces.GPU(duration=600)
 @torch.no_grad()
 def sample(
     audio_list,