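"""AudioLDM text-to-audio Gradio demo.

Generates audio from a text prompt with the AudioLDM pipeline and, when
several candidates are requested, re-ranks them with CLAP to return the
waveform that best matches the prompt.
"""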
import gradio as gr
from diffusers import AudioLDMPipeline
# The recommended "fast" scheduler:
#from diffusers import DPMSolverMultistepScheduler
# The default AudioLDM scheduler:
#from diffusers import DDIMScheduler
#from diffusers import DDPMScheduler
#from diffusers import DEISMultistepScheduler
#from diffusers import DPMSolverSinglestepScheduler
#from diffusers import HeunDiscreteScheduler
from diffusers import KDPM2DiscreteScheduler  # the scheduler used in this script
#from diffusers import KDPM2AncestralDiscreteScheduler
#from diffusers import LMSDiscreteScheduler
#from diffusers import PNDMScheduler
#from diffusers import EulerDiscreteScheduler
#from diffusers import EulerAncestralDiscreteScheduler
#from diffusers import UniPCMultistepScheduler
from transformers import AutoProcessor, ClapModel
import torch
# import scipy  # only needed for the optional .wav export in texttoaudio()

device = "cpu"  # set to "cuda" to run on a GPU if one is available
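# Load the AudioLDM text-to-audio pipeline from the Hugging Face Hub.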
repo_id = "cvssp/audioldm-s-full-v2"
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)
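# Scheduler selection: KDPM2DiscreteScheduler is active below; uncomment an alternative to compare.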
#pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)
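# CLAP model and processor, used to score generated audio against the text prompt.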
clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000)
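# Seedable RNG so a given seed reproduces the same audio.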
generator = torch.Generator(device)
def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    """Generate audio from text and return (sample_rate, waveform) for gr.Audio."""
    if prompt is None:
        raise gr.Error("Please provide a text input.")
    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]
    # Optionally save a sample as a .wav file:
    # scipy.io.wavfile.write("output.wav", rate=16000, data=waveforms[0])
    # When several candidates were generated, keep the one CLAP ranks closest to the prompt.
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]
    return (16000, waveform)
def score_waveforms(text, waveforms):
    """Rank candidate waveforms with CLAP and return the best match for the text."""
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # audio-text similarity scores
    probs = logits_per_text.softmax(dim=-1)  # softmax over candidates gives probabilities
    most_probable = torch.argmax(probs)  # index of the most likely waveform
    waveform = waveforms[most_probable]
    return waveform
iface = gr.Interface(
    fn=texttoaudio,
    title="AudioLDM Testing Playground",
    # Inputs map to: prompt, negative prompt, seed, inference steps, guidance scale, candidates.
    inputs=["text", "text", "number", "number", "number", "number"],
    outputs="audio",
)
iface.launch()