|
import torch |
|
import gradio as gr |
|
from transformers import ViTImageProcessor, ViTModel |
|
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder |
|
from pedalboard.io import AudioFile |
|
from pedalboard import Pedalboard, NoiseGate, Compressor, LowShelfFilter, Gain, HighShelfFilter, Reverb |
|
|
|
# Run everything on GPU when available; the pipeline and all tensors are
# moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Two independent RNGs handed to the pipeline: one for the initial latent
# noise (`generator`), one for the per-step scheduler noise (`step_generator`).
generator1 = torch.Generator(device)

generator2 = torch.Generator(device)


# Conditional audio-diffusion pipeline: takes an image embedding and produces
# a Mel-spectrogram image plus the corresponding waveform.
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)


# ViT backbone (ImageNet-21k pretraining) used to embed the conditioning image.
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

image_encoder = ImageEncoder(processor, extractor)


# Post-processing effects chain applied to the generated waveform in
# _denoise_audio: gate out low-level noise, EQ, make-up gain, light reverb.
# NOTE(review): Compressor(threshold_db=60, ratio=1.0) is a no-op as
# configured (a 1.0 ratio applies no compression, and a +60 dB threshold is
# never reached), and Gain(+40 dB) is very hot — confirm these settings are
# intentional.
board = Pedalboard([
    NoiseGate(threshold_db=-60, ratio=10.0),
    Compressor(threshold_db=60, ratio=1.0),
    LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),
    HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),
    Gain(gain_db=40),
    Reverb(room_size=0.5),
])
|
|
|
def _encode_image(image):
    """Embed the conditioning image and shape it for the diffusion pipeline.

    Args:
        image: Conditioning image (PIL image from the Gradio input).

    Returns:
        The ViT embedding with an extra axis inserted at position 1
        (presumably (batch, 1, embed_dim) — confirm against
        AudioDiffusionPipeline's `encoding` argument), moved to `device`.
    """
    # `torch.unsqueeze`'s documented keyword is `dim`; the original used the
    # undocumented numpy-compat alias `axis`, which is not part of the
    # public signature and fails on some torch versions.
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)
|
|
|
def _generate_spectrogram(condition, steps, eta):
    """Sample one spectrogram/waveform pair conditioned on an image embedding.

    Args:
        condition: Image-embedding tensor used as the pipeline's `encoding`.
        steps: Number of denoising steps.
        eta: DDIM eta — interpolates deterministic (0.0) vs stochastic (1.0)
            denoising.

    Returns:
        (spectrogram_image, (sample_rate, waveform)) for the single
        generated item.
    """
    pipeline_args = dict(
        batch_size=1,
        steps=steps,
        generator=generator1,       # initial latent noise
        step_generator=generator2,  # per-step scheduler noise
        encoding=condition,
        eta=eta,
        return_dict=False,
    )
    images, (sample_rate, audios) = pipe(**pipeline_args)
    return images[0], (sample_rate, audios[0])
|
|
|
def _denoise_audio(audio, sr):
    """Run the module-level Pedalboard effects chain over `audio` at rate `sr`."""
    processed = board(audio, sr)
    return processed
|
|
|
def run_generation(image, steps, eta):
    """Gradio callback: image -> (Mel spectrogram, (sample_rate, audio)).

    Encodes the conditioning image, samples one spectrogram/waveform pair
    from the diffusion pipeline, and post-processes the waveform with the
    effects chain before returning it to the UI.
    """
    embedding = _encode_image(image)
    spectrogram, (sample_rate, waveform) = _generate_spectrogram(embedding, steps, eta)
    cleaned = _denoise_audio(waveform, sample_rate)
    return spectrogram, (sample_rate, cleaned)
|
|
|
# --- Gradio UI -----------------------------------------------------------
# Layout: one row with an input column (image + sampling controls) and an
# output column (spectrogram image + audio player).
with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
    # Image-based soundtrack generation
    ''')
    with gr.Row():
        # Input column: conditioning image and sampling parameters.
        with gr.Column():
            image = gr.Image(
                type="pil",
                label="Conditioning image"
            )
            steps = gr.Slider(
                minimum=10,
                maximum=1000,
                step=10,
                value=50,
                label="Denoising steps"
            )
            eta = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.6,
                label="η"
            )
            gr.Markdown('''
    Eta (η) is a variable that controls the level of interpolation between deterministic (η=0.0) and stochastic (η=1.0) denoising schedule.
    ''')
            btn = gr.Button("Generate")
            # NOTE(review): this resets only the input image, not the
            # generated spectrogram/audio outputs — confirm intended.
            clear = gr.ClearButton(image)
        # Output column: generated spectrogram and resulting audio.
        with gr.Column():
            spectrogram = gr.Image(
                label="Generated Mel spectrogram"
            )
            audio = gr.Audio(
                label="Resulting audio"
            )
    # Wire the Generate button to the callback defined above.
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])


demo.launch()