Commit 37ac125 · Parent(s): 5b835a5

init

Files changed:
- .gitignore +2 -0
- app.py +1 -3
- src/moviedubber/infer_with_mmlm_result.py +9 -249
.gitignore CHANGED

@@ -178,3 +178,5 @@ results/
 *mp4
 temp/
 src/moviedubber/infer/basic_test.toml
+
+upload.py
app.py CHANGED

@@ -43,10 +43,8 @@ def load_asr_model(model_id="openai/whisper-large-v3-turbo"):
 
 
 device = "cpu"
-config = tomli.load(open("src/moviedubber/infer/basic.toml", "rb"))
 
-
-ema_model, vocoder, ort_session = load_models(config, device=device)
+ema_model, vocoder, ort_session = load_models(device=device)
 asr_pipe = load_asr_model()
 
 videofeature_extractor = VideoFeatureExtractor(device=device)
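With this change app.py no longer parses the TOML config at startup; load_models() resolves its own assets and takes only a device (see the infer_with_mmlm_result.py diff below). As an aside, if the TOML config were still needed elsewhere, a context manager avoids the unclosed file handle left by the removed tomli.load(open(...)) one-liner; a minimal sketch, using the path from the removed line:

import tomli

# read the inference config; the with-block closes the file handle,
# unlike the removed tomli.load(open(...)) idiom
with open("src/moviedubber/infer/basic.toml", "rb") as f:
    config = tomli.load(f)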
src/moviedubber/infer_with_mmlm_result.py CHANGED

@@ -1,40 +1,17 @@
-import argparse
 import os
-import os.path as osp
-import random
-import sys
-from pathlib import Path
 
-import numpy as np
 import onnxruntime
-import soundfile
-import tomli
-import torch
-import torch.nn.functional as F
 import torchaudio
 import torchaudio.compliance.kaldi as kaldi
+from huggingface_hub import hf_hub_download
 from moviepy import AudioFileClip, VideoFileClip
 from omegaconf import OmegaConf
-from pydub import AudioSegment
-from tqdm import tqdm
-
-
-src_path = Path(osp.dirname(__file__)).parent.parent
-sys.path.insert(0, str(src_path))
-sys.path.append(str(src_path / "src/third_party/BigVGAN"))
 
 from src.moviedubber.infer.utils_infer import (
-    cfg_strength,
-    chunk_text,
     load_model,
     load_vocoder,
-    mel_spec_type,
-    nfe_step,
-    sway_sampling_coef,
 )
-from src.moviedubber.infer.video_preprocess import VideoFeatureExtractor
 from src.moviedubber.model import ControlNetDiT, DiT
-from src.moviedubber.model.utils import convert_char_to_pinyin
 
 
 def concat_movie_with_audio(wav, video_path, out_dir):
@@ -88,20 +65,13 @@ def get_spk_emb(audio_path, ort_session):
     return embedding
 
 
-def load_models(config, device):
-    model_cfg = config.get("model_cfg", None)
-    ckpt_file = config.get("ckpt_file", None)
-    campplus_path = config.get("campplus_path", None)
-    vocab_file = config.get("vocab_file", None)
-
-    vocoder_local_path = config.get("vocoder_local_path", None)
+def load_models(device):
+    model_cfg = "src/moviedubber/configs/basemodel.yaml"
+    vocoder_name = "bigvgan"
 
-    if ckpt_file is None or vocab_file is None or vocoder_local_path is None:
-        raise ValueError("ckpt_file, vocab_file and vocoder_local_path must be specified")
+    vocoder = load_vocoder(local_path="nvidia/bigvgan_v2_24khz_100band_256x", device=device)
 
-
-    vocoder = load_vocoder(local_path=vocoder_local_path, device=device)
+    model_path = hf_hub_download(repo_id="woak-oa/DeepDubber-V1")
 
     model_cls = DiT
     model_cfg = OmegaConf.load(model_cfg).model.arch
@@ -110,9 +80,9 @@ def load_models(config, device):
     ema_model = load_model(
         model_cls,
         model_cfg,
-        ckpt_path=ckpt_file,
+        ckpt_path=f"{model_path}/mmdubber.pt",
         mel_spec_type=vocoder_name,
-        vocab_file=vocab_file,
+        vocab_file=f"{model_path}/vocab.txt",
        controlnet=controlnet,
         device=device,
     )
@@ -122,218 +92,8 @@ def load_models(config, device):
     option.intra_op_num_threads = 1
     providers = ["CPUExecutionProvider"]
     ort_session = onnxruntime.InferenceSession(
-        campplus_path,
+        f"{model_path}/campplus.onnx",
         sess_options=option,
         providers=providers,
     )
     return ema_model, vocoder, ort_session
-
-
-def main(config, device, chunk, gen_dir, target_dir, out_dir, idx):
-    ema_model, vocoder, ort_session = load_models(config, device=device)
-
-    videofeature_extractor = VideoFeatureExtractor(device=device)
-
-    for it in tqdm(chunk, total=len(chunk), position=idx, desc=f"Processing {idx}"):
-        wav, video, text, ref_wav = it
-
-        with open(f"{target_dir}/{wav.split('/')[-1].split('.')[0]}.txt", "a") as f:
-            f.write(text + "\n")
-
-        if wav.endswith(".mp3"):
-            audio = AudioSegment.from_mp3(wav)
-
-            wav_file = wav.replace(".mp3", ".wav")
-            audio.export(wav_file, format="wav")
-
-        wav = Path(wav).with_suffix(".wav")
-        if wav.exists() is False:
-            continue
-
-        os.system(f"cp {wav} {target_dir}/")
-
-        gen_audio, sr = torchaudio.load(str(wav))
-        resampler = torchaudio.transforms.Resample(sr, 24000)
-        if sr != 24000:
-            gen_audio = resampler(gen_audio)
-
-        if gen_audio.shape[0] > 1:
-            gen_audio = torch.mean(gen_audio, dim=0, keepdim=True)
-
-        gen_video = video
-        gen_clip_path = gen_video.replace(".mp4", ".clip")
-
-        if not os.path.exists(gen_clip_path):
-            gen_clip = videofeature_extractor.extract_features(gen_video)
-
-            torch.save(gen_clip.detach().cpu(), gen_clip_path)
-
-        else:
-            gen_clip = torch.load(gen_clip_path, weights_only=True).to(device=device, dtype=torch.float32)
-
-        if ref_wav == "None":
-            use_ref_audio = False
-            gen_text_ = text
-
-            gen_clip_ = gen_clip
-
-            ref_audio_ = gen_audio
-
-            spk_emb = torch.zeros(1, 1, 192).to(device=device, dtype=torch.float32)
-
-        else:
-            use_ref_audio = True
-            ref_audio = Path(ref_wav)
-
-            spk_emb = get_spk_emb(ref_audio, ort_session)
-            spk_emb = torch.tensor(spk_emb).to(device=device, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
-
-            ref_text = ref_audio.with_suffix(".txt").read_text().strip()
-            gen_text_ = ref_text + " " + text
-
-            if ref_audio.exists() is False:
-                raise Exception(f"ref_audio {ref_audio} not found")
-
-            if ref_audio.suffix == ".mp3":
-                audio = AudioSegment.from_mp3(ref_audio)
-
-                wav_file = ref_audio.with_suffix(".wav")
-                audio.export(wav_file, format="wav")
-
-            ref_audio_, _ = torchaudio.load(str(ref_audio.with_suffix(".wav")))
-            resampler = torchaudio.transforms.Resample(sr, 24000)
-            if sr != 24000:
-                ref_audio_ = resampler(ref_audio_)
-
-            if ref_audio_.shape[0] > 1:
-                ref_audio_ = torch.mean(ref_audio_, dim=0, keepdim=True)
-
-            ref_video = ref_audio.with_suffix(".mp4")
-            ref_clip_path = ref_video.with_suffix(".clip")
-
-            if not ref_clip_path.exists():
-                ref_clip = videofeature_extractor.extract_features(str(ref_video))
-
-                torch.save(ref_clip.detach().cpu(), ref_clip_path)
-
-            else:
-                ref_clip = torch.load(ref_clip_path, weights_only=True).to(device=device, dtype=torch.float32)
-
-            gen_clip_ = torch.cat([ref_clip, gen_clip], dim=0)
-
-        gen_audio_len = gen_audio.shape[1] // 256
-
-        if use_ref_audio:
-            ref_audio_len = ref_audio_.shape[1] // 256
-            duration = ref_audio_len + gen_audio_len
-        else:
-            duration = gen_audio_len
-
-        gen_clip_ = gen_clip_.unsqueeze(0).to(device=device, dtype=torch.float32).transpose(1, 2)
-        gen_clip_ = F.interpolate(gen_clip_, size=duration, mode="linear", align_corners=False).transpose(1, 2)
-
-        gen_text_batches = chunk_text(gen_text_, max_chars=1024)
-        final_text_list = convert_char_to_pinyin(gen_text_batches)
-
-        with torch.inference_mode():
-            generated, _ = ema_model.sample(
-                cond=ref_audio_.to(device),
-                text=final_text_list,
-                clip=gen_clip_,
-                spk_emb=spk_emb,
-                duration=duration,
-                steps=nfe_step,
-                cfg_strength=cfg_strength,
-                sway_sampling_coef=sway_sampling_coef,
-                no_ref_audio=not use_ref_audio,
-            )
-
-            generated = generated.to(torch.float32)
-
-            if use_ref_audio:
-                generated = generated[:, ref_audio_len:, :]
-
-            generated_mel_spec = generated.permute(0, 2, 1)
-            generated_wave = vocoder(generated_mel_spec)
-
-            generated_wave = generated_wave.squeeze().cpu().numpy()
-
-        out_path = osp.join(gen_dir, f"{wav.stem}.wav")
-        soundfile.write(out_path, generated_wave, samplerate=24000)
-        _ = concat_movie_with_audio(out_path, gen_video, out_dir)
-
-
-if __name__ == "__main__":
-    import torch.multiprocessing as mp
-
-    parser = argparse.ArgumentParser(
-        prog="python3 infer-cli.py",
-        description="Commandline interface for moviedubber infer with Advanced Batch Processing.",
-        epilog="Specify options above to override one or more settings from config.",
-    )
-    parser.add_argument(
-        "-c",
-        "--config",
-        type=str,
-        default="src/moviedubber/infer/basic.toml",
-        help="The configuration file, default see infer/basic.toml",
-    )
-    parser.add_argument("-i", "--input_list", type=str, required=True, help="The val list file")
-    parser.add_argument("-s", "--ref_spk_list", type=str, required=True, help="The spk list file")
-    parser.add_argument("-o", "--out_dir", type=str, default="data/dubberout", help="The output directory")
-    parser.add_argument("--gpuids", type=str, help="GPU ids to use, split by comma")
-    parser.add_argument("--nums_workers", type=int, default=1, help="Number of workers for per gpu")
-
-    args = parser.parse_args()
-
-    out_dir = args.out_dir
-    input_list = args.input_list
-    gpu_ids = args.gpuids.split(",") if args.gpuids else ["0"]
-    num_pre = args.nums_workers
-    spk_ref_path = args.ref_spk_list
-
-    config = tomli.load(open(args.config, "rb"))
-
-    gen_lst = Path(input_list).read_text().splitlines()[1:]
-
-    gen_pre_conf = []
-
-    spk_lines = Path(spk_ref_path).read_text().splitlines()
-
-    for idx, line in enumerate(gen_lst):
-        if line.strip():
-            mp4_path, is_correc, _, _ = line.split(",")
-
-            wav_path = mp4_path.replace(".mp4", ".mp3")
-            text = Path(wav_path.replace(".mp3", ".txt")).read_text().strip()
-
-            if is_correc == "True":
-                ref_wav = spk_lines[idx].split(",")[1].strip()
-            else:
-                ref_wav = random.choice(spk_lines).split(",")[-1].strip()  # Use random speaker for incorrect samples
-
-            gen_pre_conf.append([wav_path, mp4_path, text, ref_wav])
-
-    chunks = np.array_split(gen_pre_conf, len(gpu_ids) * num_pre)
-
-    gen_dir = os.path.join(out_dir, "generated")
-    target_dir = os.path.join(out_dir, "target")
-
-    if os.path.exists(gen_dir) is False or os.path.exists(target_dir) is False:
-        os.makedirs(gen_dir)
-        os.makedirs(target_dir)
-
-    mp.set_start_method("spawn", force=True)
-    processes = []
-    for idx, chunk in enumerate(chunks):
-        device = gpu_ids[idx % len(gpu_ids)]
-
-        device = f"cuda:{device}"
-        p = mp.Process(target=main, args=(config, device, chunk, gen_dir, target_dir, out_dir, idx))
-        processes.append(p)
-        p.start()
-
-    for process in processes:
-        process.join()
-
-    print("All processes finished.")
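One caveat on the new download logic: in the upstream huggingface_hub API, hf_hub_download() requires a filename argument and returns the path to that single cached file, whereas the code above uses model_path as a directory prefix (f"{model_path}/mmdubber.pt" and friends). A sketch of how that directory-style path is usually obtained, assuming the repo layout implied by this diff (mmdubber.pt, vocab.txt, campplus.onnx at the repo root):

from huggingface_hub import snapshot_download

# snapshot_download() fetches the whole repo and returns the local
# snapshot directory, so the f"{model_path}/..." joins above resolve
model_path = snapshot_download(repo_id="woak-oa/DeepDubber-V1")

ckpt_path = f"{model_path}/mmdubber.pt"        # DiT checkpoint, per this diff
vocab_file = f"{model_path}/vocab.txt"         # tokenizer vocab, per this diff
campplus_onnx = f"{model_path}/campplus.onnx"  # speaker-embedding ONNX model, per this diff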