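"""AudioLDM Testing Playground.

A Gradio demo that generates audio from a text prompt with AudioLDM and,
when several candidate waveforms are requested, uses CLAP to return the
one that best matches the prompt.
"""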
import gradio as gr

from diffusers import AudioLDMPipeline
# The recommended "fast" scheduler:
#from diffusers import DPMSolverMultistepScheduler
# The default AudioLDM scheduler:
#from diffusers import DDIMScheduler
#from diffusers import DDPMScheduler
#from diffusers import DEISMultistepScheduler
#from diffusers import DPMSolverSinglestepScheduler
#from diffusers import HeunDiscreteScheduler
from diffusers import KDPM2DiscreteScheduler  # the scheduler currently selected below
#from diffusers import KDPM2AncestralDiscreteScheduler
#from diffusers import LMSDiscreteScheduler
#from diffusers import PNDMScheduler
#from diffusers import EulerDiscreteScheduler
#from diffusers import EulerAncestralDiscreteScheduler
#from diffusers import UniPCMultistepScheduler

from transformers import AutoProcessor, ClapModel

import torch
# import scipy

device="cpu"

repo_id = "cvssp/audioldm-s-full-v2"
# float32, since half-precision inference is generally unsupported on CPU
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float32)
#pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config)

#pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
#pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
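# To try a different sampler, comment out the KDPM2 line above and
# uncomment exactly one scheduler here (plus its matching import at the top).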
pipe = pipe.to(device)

clap_model = ClapModel.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full").to(device)
processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-full", sampling_rate=16000)
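# CLAP embeds text and audio in a shared space; score_waveforms() below uses
# it to rank the candidate waveforms against the prompt.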

generator = torch.Generator(device=device)  # reusable RNG, re-seeded per request

def texttoaudio(prompt, neg_prompt, seed, inf_steps, guidance_scale, n_candidates):
    if not prompt:  # Gradio passes an empty string, not None, for a blank textbox
        raise gr.Error("Please provide a text input.")

    waveforms = pipe(
        prompt,
        negative_prompt=neg_prompt,
        num_inference_steps=int(inf_steps),
        guidance_scale=guidance_scale,
        audio_length_in_s=5.0,
        generator=generator.manual_seed(int(seed)),
        num_waveforms_per_prompt=int(n_candidates) if n_candidates else 1,
    )["audios"]

    # Pick the candidate that best matches the prompt when several were
    # generated; otherwise use the single result.
    if waveforms.shape[0] > 1:
        waveform = score_waveforms(prompt, waveforms)
    else:
        waveform = waveforms[0]

    # Optionally save the audio sample as a .wav file:
    # scipy.io.wavfile.write("output.wav", rate=16000, data=waveform)
    return (16000, waveform)

def score_waveforms(text, waveforms):
    inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True, sampling_rate=16000)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits_per_text = clap_model(**inputs).logits_per_text  # this is the audio-text similarity score
        probs = logits_per_text.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        most_probable = torch.argmax(probs)  # and now select the most likely audio waveform
    waveform = waveforms[most_probable]
    return waveform

iface = gr.Interface(
    fn=texttoaudio,
    title="AudioLDM Testing Playground",
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Negative prompt"),
        gr.Number(label="Seed"),
        gr.Number(label="Inference steps"),
        gr.Number(label="Guidance scale"),
        gr.Number(label="Candidate waveforms"),
    ],
    outputs="audio",
)

iface.launch()
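
# Pass share=True to launch() for a temporary public URL, e.g.:
# iface.launch(share=True)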