import torch
import gradio as gr
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
from pedalboard import Pedalboard, NoiseGate, Compressor, LowShelfFilter, Gain, HighShelfFilter, Reverb

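# Run on the GPU when available; two RNGs are used so the initial noise and
# the per-step scheduler noise can be seeded independently.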
device = "cuda" if torch.cuda.is_available() else "cpu"
generator1 = torch.Generator(device)
generator2 = torch.Generator(device)
    
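# Pretrained conditional audio-diffusion pipeline (Mel spectrogram + audio output).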
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)

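# ViT backbone used to embed the conditioning image.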
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_encoder = ImageEncoder(processor, extractor)

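# Post-processing chain for the raw diffusion output: gate, compressor,
# shelving EQ, make-up gain, and a light reverb.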
board = Pedalboard([
    NoiseGate(threshold_db=-60, ratio=10.0),
    Compressor(threshold_db=60, ratio=1.0),
    LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),
    HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),
    Gain(gain_db=40),
    Reverb(room_size=0.5),
])

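# Embed the conditioning image and add a length-1 sequence dimension
# expected by the pipeline's encoding input.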
def _encode_image(image):
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)

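# Sample a single Mel spectrogram (and its decoded audio) conditioned on the
# image embedding.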
def _generate_spectrogram(condition, steps, eta):
    images, (sample_rate, audios) = pipe(
        batch_size=1,
        steps=steps,
        generator=generator1,
        step_generator=generator2,
        encoding=condition, 
        eta=eta,
        return_dict=False,
    )
    return images[0], (sample_rate, audios[0])

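# Clean up the generated audio with the pedalboard chain defined above.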
def _denoise_audio(audio, sr):
    return board(audio, sr)

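# End-to-end generation: encode the image, sample a spectrogram, post-process the audio.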
def run_generation(image, steps, eta):
    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    audio = _denoise_audio(audio, sr)
    return spectrogram, (sr, audio)

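# Gradio front end: conditioning image and sampling controls on the left,
# generated spectrogram and audio on the right.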
with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
        # Image-based soundtrack generation
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(
                type="pil", 
                label="Conditioning image"
            )
            steps = gr.Slider(
                minimum=10, 
                maximum=1000, 
                step=10, 
                value=50,
                label="Denoising steps"
            )
            eta = gr.Slider(
                minimum=0.0, 
                maximum=1.0, 
                step=0.1, 
                value=0.6,
                label="η"
            )
            gr.Markdown('''
                Eta (η) controls the interpolation between a deterministic (η=0.0) and a fully stochastic (η=1.0) denoising schedule.
            ''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)
        with gr.Column():
            spectrogram = gr.Image(
                label="Generated Mel spectrogram"
            )
            audio = gr.Audio(
                label="Resulting audio"
            )
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])
    
demo.launch()
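
# A minimal headless sketch for reference (hypothetical; assumes a local
# "example.jpg" exists and bypasses the UI above):
#
#   from PIL import Image
#   spectrogram, (sr, audio) = run_generation(Image.open("example.jpg"), steps=50, eta=0.6)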