import os
import copy
import json
import time
import torch
import argparse
import soundfile as sf
import wandb
from tqdm import tqdm
from diffusers import DDPMScheduler
from audioldm_eval import EvaluationHelper
from models import build_pretrained_models, AudioDiffusion
from transformers import AutoProcessor, ClapModel
import torchaudio
from tools.torch_tools import read_wav_file
from tango import Tango
import numpy as np
import librosa
import laion_clap
def clap_score_computation(wav_output_dir, text_prompts):
    """Mean CLAP cosine similarity between the generated wav files and their text prompts."""
    cos_sim = torch.nn.CosineSimilarity()
    model = laion_clap.CLAP_Module(enable_fusion=False)
    model.load_ckpt()  # download the default pretrained checkpoint
    model.eval()

    output_dir = wav_output_dir
    audio_file = [
        "{}/output_{}.wav".format(output_dir, i) for i in range(len(text_prompts))
    ]
    with torch.no_grad():
        audio_embed = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=True).cpu()
    with torch.no_grad():
        text_embed = model.get_text_embedding(text_prompts, use_tensor=True).cpu()

    clap_score = torch.mean(cos_sim(audio_embed, text_embed)).item()
    return clap_score
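# Illustrative usage sketch (not part of the original script): the helper above expects the
# directory to already contain output_0.wav ... output_{N-1}.wav, one file per prompt and in
# the same order as the prompt list. The directory name and prompts below are hypothetical.
#
#   prompts = ["a dog barking", "rain falling on a tin roof"]
#   score = clap_score_computation("outputs/example_run", prompts)
#   print("CLAP score: {:.2f}".format(score))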
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
def parse_args():
    parser = argparse.ArgumentParser(description="Inference for text to audio generation task.")
    parser.add_argument(
        "--original_args", type=str, default=None,
        help="Path for summary jsonl file saved during training."
    )
    parser.add_argument(
        "--model", type=str, default=None,
        help="Path for saved model bin file."
    )
    parser.add_argument(
        "--test_file", type=str, default="data/test_audiocaps_subset.json",
        help="json file containing the test prompts for generation."
    )
    parser.add_argument(
        "--text_key", type=str, default="captions",
        help="Key containing the text in the json file."
    )
    parser.add_argument(
        "--test_references", type=str, default="data/audiocaps_test_references/subset",
        help="Folder containing the test reference wav files."
    )
    parser.add_argument(
        "--num_steps", type=int, default=200,
        help="How many denoising steps for generation.",
    )
    parser.add_argument(
        "--guidance", type=float, default=3,
        help="Guidance scale for classifier free guidance."
    )
    parser.add_argument(
        "--batch_size", type=int, default=8,
        help="Batch size for generation.",
    )
    parser.add_argument(
        "--num_samples", type=int, default=1,
        help="How many samples per prompt.",
    )
    args = parser.parse_args()
    return args
def main():
    args = parse_args()
    train_args = dotdict(json.loads(open(args.original_args).readlines()[0]))

    if "hf_model" not in train_args:
        train_args["hf_model"] = None

    # Load Models #
    if train_args.hf_model:
        tango = Tango(train_args.hf_model, "cpu")
        vae, stft, model = tango.vae.cuda(), tango.stft.cuda(), tango.model.cuda()
    else:
        name = "audioldm-s-full"
        vae, stft = build_pretrained_models(name)
        vae, stft = vae.cuda(), stft.cuda()
        model = AudioDiffusion(
            train_args.text_encoder_name, train_args.scheduler_name, train_args.unet_model_name, train_args.unet_model_config, train_args.snr_gamma
        ).cuda()
    model.eval()
    # Load Trained Weight #
    device = vae.device()
    model.load_state_dict(torch.load(args.model))

    scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
    evaluator = EvaluationHelper(16000, "cuda:0")

    wandb.init(project="Text to Audio Diffusion Evaluation")
    # Load Data #
    if train_args.prefix:
        prefix = train_args.prefix
    else:
        prefix = ""

    text_prompts = [json.loads(line)[args.text_key] for line in open(args.test_file).readlines()]
    text_prompts = [prefix + inp for inp in text_prompts]

    # Generate #
    num_steps, guidance, batch_size, num_samples = args.num_steps, args.guidance, args.batch_size, args.num_samples
    all_outputs = []

    for k in tqdm(range(0, len(text_prompts), batch_size)):
        text = text_prompts[k: k + batch_size]

        with torch.no_grad():
            latents = model.inference(text, scheduler, num_steps, guidance, num_samples, disable_progress=True)
            mel = vae.decode_first_stage(latents)
            wave = vae.decode_to_waveform(mel)

        all_outputs += [item for item in wave]
    # Save #
    exp_id = str(int(time.time()))
    if not os.path.exists("outputs"):
        os.makedirs("outputs")

    output_dir = "outputs/{}_{}_steps_{}_guidance_{}".format(exp_id, "_".join(args.model.split("/")[1:-1]), num_steps, guidance)
    os.makedirs(output_dir, exist_ok=True)
    for j, wav in enumerate(all_outputs):
        sf.write("{}/output_{}.wav".format(output_dir, j), wav, samplerate=16000)

    # Evaluate #
    clap_score = clap_score_computation(output_dir, text_prompts)
    result = evaluator.main(output_dir, args.test_references)
    result["Steps"] = num_steps
    result["Guidance Scale"] = guidance
    result["Test Instances"] = len(text_prompts)
    result["Clap Score"] = np.round(clap_score, 2)

    wandb.log(result)

    result["scheduler_config"] = dict(scheduler.config)
    result["args"] = dict(vars(args))
    result["output_dir"] = output_dir

    with open("outputs/summary.jsonl", "a") as f:
        f.write(json.dumps(result) + "\n\n")


if __name__ == "__main__":
    main()
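# Example invocation (a sketch; the script name and the checkpoint/summary paths below are
# placeholders, not files shipped with the repository):
#
#   python inference.py \
#       --original_args saved/summary.jsonl \
#       --model saved/best.bin \
#       --test_file data/test_audiocaps_subset.json \
#       --num_steps 200 --guidance 3 --batch_size 8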