none committed on
Commit
37ac125
·
1 Parent(s): 5b835a5
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +1 -3
  3. src/moviedubber/infer_with_mmlm_result.py +9 -249
.gitignore CHANGED
@@ -178,3 +178,5 @@ results/
178
  *mp4
179
  temp/
180
  src/moviedubber/infer/basic_test.toml
 
 
 
178
  *mp4
179
  temp/
180
  src/moviedubber/infer/basic_test.toml
181
+
182
+ upload.py
app.py CHANGED
@@ -43,10 +43,8 @@ def load_asr_model(model_id="openai/whisper-large-v3-turbo"):
43
 
44
 
45
  device = "cpu"
46
- config = tomli.load(open("src/moviedubber/infer/basic.toml", "rb"))
47
 
48
-
49
- ema_model, vocoder, ort_session = load_models(config, device=device)
50
  asr_pipe = load_asr_model()
51
 
52
  videofeature_extractor = VideoFeatureExtractor(device=device)
 
43
 
44
 
45
  device = "cpu"
 
46
 
47
+ ema_model, vocoder, ort_session = load_models(device=device)
 
48
  asr_pipe = load_asr_model()
49
 
50
  videofeature_extractor = VideoFeatureExtractor(device=device)
src/moviedubber/infer_with_mmlm_result.py CHANGED
@@ -1,40 +1,17 @@
1
- import argparse
2
  import os
3
- import os.path as osp
4
- import random
5
- import sys
6
- from pathlib import Path
7
 
8
- import numpy as np
9
  import onnxruntime
10
- import soundfile
11
- import tomli
12
- import torch
13
- import torch.nn.functional as F
14
  import torchaudio
15
  import torchaudio.compliance.kaldi as kaldi
 
16
  from moviepy import AudioFileClip, VideoFileClip
17
  from omegaconf import OmegaConf
18
- from pydub import AudioSegment
19
- from tqdm import tqdm
20
-
21
-
22
- src_path = Path(osp.dirname(__file__)).parent.parent
23
- sys.path.insert(0, str(src_path))
24
- sys.path.append(str(src_path / "src/third_party/BigVGAN"))
25
 
26
  from src.moviedubber.infer.utils_infer import (
27
- cfg_strength,
28
- chunk_text,
29
  load_model,
30
  load_vocoder,
31
- mel_spec_type,
32
- nfe_step,
33
- sway_sampling_coef,
34
  )
35
- from src.moviedubber.infer.video_preprocess import VideoFeatureExtractor
36
  from src.moviedubber.model import ControlNetDiT, DiT
37
- from src.moviedubber.model.utils import convert_char_to_pinyin
38
 
39
 
40
  def concat_movie_with_audio(wav, video_path, out_dir):
@@ -88,20 +65,13 @@ def get_spk_emb(audio_path, ort_session):
88
  return embedding
89
 
90
 
91
- def load_models(config, device):
92
- model_cfg = config.get("model_cfg", "src/moviedubber/configs/basemodel.yaml")
93
- ckpt_file = config.get("ckpt_file", None)
94
- campplus_path = config.get("campplus_path", None)
95
- vocab_file = config.get("vocab_file", None)
96
-
97
- vocoder_local_path = config.get("vocoder_local_path", None)
98
 
99
- if ckpt_file is None or vocab_file is None or vocoder_local_path is None or campplus_path is None:
100
- raise ValueError("ckpt_file, vocab_file and vocoder_local_path must be specified")
101
 
102
- vocoder_name = config.get("vocoder_name", mel_spec_type)
103
-
104
- vocoder = load_vocoder(local_path=vocoder_local_path, device=device)
105
 
106
  model_cls = DiT
107
  model_cfg = OmegaConf.load(model_cfg).model.arch
@@ -110,9 +80,9 @@ def load_models(config, device):
110
  ema_model = load_model(
111
  model_cls,
112
  model_cfg,
113
- ckpt_file,
114
  mel_spec_type=vocoder_name,
115
- vocab_file=vocab_file,
116
  controlnet=controlnet,
117
  device=device,
118
  )
@@ -122,218 +92,8 @@ def load_models(config, device):
122
  option.intra_op_num_threads = 1
123
  providers = ["CPUExecutionProvider"]
124
  ort_session = onnxruntime.InferenceSession(
125
- campplus_path,
126
  sess_options=option,
127
  providers=providers,
128
  )
129
  return ema_model, vocoder, ort_session
130
-
131
-
132
- def main(config, device, chunk, gen_dir, target_dir, out_dir, idx):
133
- ema_model, vocoder, ort_session = load_models(config, device=device)
134
-
135
- videofeature_extractor = VideoFeatureExtractor(device=device)
136
-
137
- for it in tqdm(chunk, total=len(chunk), position=idx, desc=f"Processing {idx}"):
138
- wav, video, text, ref_wav = it
139
-
140
- with open(f"{target_dir}/{wav.split('/')[-1].split('.')[0]}.txt", "a") as f:
141
- f.write(text + "\n")
142
-
143
- if wav.endswith(".mp3"):
144
- audio = AudioSegment.from_mp3(wav)
145
-
146
- wav_file = wav.replace(".mp3", ".wav")
147
- audio.export(wav_file, format="wav")
148
-
149
- wav = Path(wav).with_suffix(".wav")
150
- if wav.exists() is False:
151
- continue
152
-
153
- os.system(f"cp {wav} {target_dir}/")
154
-
155
- gen_audio, sr = torchaudio.load(str(wav))
156
- resampler = torchaudio.transforms.Resample(sr, 24000)
157
- if sr != 24000:
158
- gen_audio = resampler(gen_audio)
159
-
160
- if gen_audio.shape[0] > 1:
161
- gen_audio = torch.mean(gen_audio, dim=0, keepdim=True)
162
-
163
- gen_video = video
164
- gen_clip_path = gen_video.replace(".mp4", ".clip")
165
-
166
- if not os.path.exists(gen_clip_path):
167
- gen_clip = videofeature_extractor.extract_features(gen_video)
168
-
169
- torch.save(gen_clip.detach().cpu(), gen_clip_path)
170
-
171
- else:
172
- gen_clip = torch.load(gen_clip_path, weights_only=True).to(device=device, dtype=torch.float32)
173
-
174
- if ref_wav == "None":
175
- use_ref_audio = False
176
- gen_text_ = text
177
-
178
- gen_clip_ = gen_clip
179
-
180
- ref_audio_ = gen_audio
181
-
182
- spk_emb = torch.zeros(1, 1, 192).to(device=device, dtype=torch.float32)
183
-
184
- else:
185
- use_ref_audio = True
186
- ref_audio = Path(ref_wav)
187
-
188
- spk_emb = get_spk_emb(ref_audio, ort_session)
189
- spk_emb = torch.tensor(spk_emb).to(device=device, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
190
-
191
- ref_text = ref_audio.with_suffix(".txt").read_text().strip()
192
- gen_text_ = ref_text + " " + text
193
-
194
- if ref_audio.exists() is False:
195
- raise Exception(f"ref_audio {ref_audio} not found")
196
-
197
- if ref_audio.suffix == ".mp3":
198
- audio = AudioSegment.from_mp3(ref_audio)
199
-
200
- wav_file = ref_audio.with_suffix(".wav")
201
- audio.export(wav_file, format="wav")
202
-
203
- ref_audio_, _ = torchaudio.load(str(ref_audio.with_suffix(".wav")))
204
- resampler = torchaudio.transforms.Resample(sr, 24000)
205
- if sr != 24000:
206
- ref_audio_ = resampler(ref_audio_)
207
-
208
- if ref_audio_.shape[0] > 1:
209
- ref_audio_ = torch.mean(ref_audio_, dim=0, keepdim=True)
210
-
211
- ref_video = ref_audio.with_suffix(".mp4")
212
- ref_clip_path = ref_video.with_suffix(".clip")
213
-
214
- if not ref_clip_path.exists():
215
- ref_clip = videofeature_extractor.extract_features(str(ref_video))
216
-
217
- torch.save(ref_clip.detach().cpu(), ref_clip_path)
218
-
219
- else:
220
- ref_clip = torch.load(ref_clip_path, weights_only=True).to(device=device, dtype=torch.float32)
221
-
222
- gen_clip_ = torch.cat([ref_clip, gen_clip], dim=0)
223
-
224
- gen_audio_len = gen_audio.shape[1] // 256
225
-
226
- if use_ref_audio:
227
- ref_audio_len = ref_audio_.shape[1] // 256
228
- duration = ref_audio_len + gen_audio_len
229
- else:
230
- duration = gen_audio_len
231
-
232
- gen_clip_ = gen_clip_.unsqueeze(0).to(device=device, dtype=torch.float32).transpose(1, 2)
233
- gen_clip_ = F.interpolate(gen_clip_, size=duration, mode="linear", align_corners=False).transpose(1, 2)
234
-
235
- gen_text_batches = chunk_text(gen_text_, max_chars=1024)
236
- final_text_list = convert_char_to_pinyin(gen_text_batches)
237
-
238
- with torch.inference_mode():
239
- generated, _ = ema_model.sample(
240
- cond=ref_audio_.to(device),
241
- text=final_text_list,
242
- clip=gen_clip_,
243
- spk_emb=spk_emb,
244
- duration=duration,
245
- steps=nfe_step,
246
- cfg_strength=cfg_strength,
247
- sway_sampling_coef=sway_sampling_coef,
248
- no_ref_audio=not use_ref_audio,
249
- )
250
-
251
- generated = generated.to(torch.float32)
252
-
253
- if use_ref_audio:
254
- generated = generated[:, ref_audio_len:, :]
255
-
256
- generated_mel_spec = generated.permute(0, 2, 1)
257
- generated_wave = vocoder(generated_mel_spec)
258
-
259
- generated_wave = generated_wave.squeeze().cpu().numpy()
260
-
261
- out_path = osp.join(gen_dir, f"{wav.stem}.wav")
262
- soundfile.write(out_path, generated_wave, samplerate=24000)
263
- _ = concat_movie_with_audio(out_path, gen_video, out_dir)
264
-
265
-
266
- if __name__ == "__main__":
267
- import torch.multiprocessing as mp
268
-
269
- parser = argparse.ArgumentParser(
270
- prog="python3 infer-cli.py",
271
- description="Commandline interface for moviedubber infer with Advanced Batch Processing.",
272
- epilog="Specify options above to override one or more settings from config.",
273
- )
274
- parser.add_argument(
275
- "-c",
276
- "--config",
277
- type=str,
278
- default="src/moviedubber/infer/basic.toml",
279
- help="The configuration file, default see infer/basic.toml",
280
- )
281
- parser.add_argument("-i", "--input_list", type=str, required=True, help="The val list file")
282
- parser.add_argument("-s", "--ref_spk_list", type=str, required=True, help="The spk list file")
283
- parser.add_argument("-o", "--out_dir", type=str, default="data/dubberout", help="The output directory")
284
- parser.add_argument("--gpuids", type=str, help="GPU ids to use, split by comma")
285
- parser.add_argument("--nums_workers", type=int, default=1, help="Number of workers for per gpu")
286
-
287
- args = parser.parse_args()
288
-
289
- out_dir = args.out_dir
290
- input_list = args.input_list
291
- gpu_ids = args.gpuids.split(",") if args.gpuids else ["0"]
292
- num_pre = args.nums_workers
293
- spk_ref_path = args.ref_spk_list
294
-
295
- config = tomli.load(open(args.config, "rb"))
296
-
297
- gen_lst = Path(input_list).read_text().splitlines()[1:]
298
-
299
- gen_pre_conf = []
300
-
301
- spk_lines = Path(spk_ref_path).read_text().splitlines()
302
-
303
- for idx, line in enumerate(gen_lst):
304
- if line.strip():
305
- mp4_path, is_correc, _, _ = line.split(",")
306
-
307
- wav_path = mp4_path.replace(".mp4", ".mp3")
308
- text = Path(wav_path.replace(".mp3", ".txt")).read_text().strip()
309
-
310
- if is_correc == "True":
311
- ref_wav = spk_lines[idx].split(",")[1].strip()
312
- else:
313
- ref_wav = random.choice(spk_lines).split(",")[-1].strip() # Use random speaker for incorrect samples
314
-
315
- gen_pre_conf.append([wav_path, mp4_path, text, ref_wav])
316
-
317
- chunks = np.array_split(gen_pre_conf, len(gpu_ids) * num_pre)
318
-
319
- gen_dir = os.path.join(out_dir, "generated")
320
- target_dir = os.path.join(out_dir, "target")
321
-
322
- if os.path.exists(gen_dir) is False or os.path.exists(target_dir) is False:
323
- os.makedirs(gen_dir)
324
- os.makedirs(target_dir)
325
-
326
- mp.set_start_method("spawn", force=True)
327
- processes = []
328
- for idx, chunk in enumerate(chunks):
329
- device = gpu_ids[idx % len(gpu_ids)]
330
-
331
- device = f"cuda:{device}"
332
- p = mp.Process(target=main, args=(config, device, chunk, gen_dir, target_dir, out_dir, idx))
333
- processes.append(p)
334
- p.start()
335
-
336
- for process in processes:
337
- process.join()
338
-
339
- print("All processes finished.")
 
 
1
  import os
 
 
 
 
2
 
 
3
  import onnxruntime
 
 
 
 
4
  import torchaudio
5
  import torchaudio.compliance.kaldi as kaldi
6
+ from huggingface_hub import hf_hub_download
7
  from moviepy import AudioFileClip, VideoFileClip
8
  from omegaconf import OmegaConf
 
 
 
 
 
 
 
9
 
10
  from src.moviedubber.infer.utils_infer import (
 
 
11
  load_model,
12
  load_vocoder,
 
 
 
13
  )
 
14
  from src.moviedubber.model import ControlNetDiT, DiT
 
15
 
16
 
17
  def concat_movie_with_audio(wav, video_path, out_dir):
 
65
  return embedding
66
 
67
 
68
+ def load_models(device):
69
+ model_cfg = "src/moviedubber/configs/basemodel.yaml"
70
+ vocoder_name = "bigvgan"
 
 
 
 
71
 
72
+ vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)
 
73
 
74
+ model_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1")
 
 
75
 
76
  model_cls = DiT
77
  model_cfg = OmegaConf.load(model_cfg).model.arch
 
80
  ema_model = load_model(
81
  model_cls,
82
  model_cfg,
83
+ ckpt_path=f"{model_path}/mmdubber.pt",
84
  mel_spec_type=vocoder_name,
85
+ vocab_file=f"{model_path}/vocab.txt",
86
  controlnet=controlnet,
87
  device=device,
88
  )
 
92
  option.intra_op_num_threads = 1
93
  providers = ["CPUExecutionProvider"]
94
  ort_session = onnxruntime.InferenceSession(
95
+ f"{model_path}/campplus.onnx",
96
  sess_options=option,
97
  providers=providers,
98
  )
99
  return ema_model, vocoder, ort_session