File size: 3,214 Bytes
02c7bdf
bdaf47a
02c7bdf
7b02833
02c7bdf
a1655f3
02c7bdf
6e4d760
341eb54
 
 
 
 
 
 
 
6e4d760
 
 
 
 
75a5cbb
 
341eb54
6e4d760
e537531
e63a812
3e28721
e63a812
 
 
 
 
a9e592e
132a2a9
 
 
 
 
e63a812
02c7bdf
3192961
 
 
6e4d760
 
b78b7d0
e805751
341eb54
b78b7d0
 
e805751
b78b7d0
9f61737
6e4d760
8310825
43e8301
 
b78b7d0
dc8fb4a
9ec5c63
90fa92b
dc8fb4a
b78b7d0
 
 
 
99710ec
863f583
49effbd
0f8dddd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os

@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Upsample speech to 48 kHz, optionally denoising it first.

    Args:
        input_wav: Path to the input audio file.
        apply_se: If truthy, run 48 kHz speech enhancement before
            super-resolution and feed the enhanced file to the SR model.

    Returns:
        Path of the written 48 kHz result file ('enhanced_high_res.wav').
    """
    myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
    fs = 48000  # MossFormer2_SR_48K emits 48 kHz audio
    if apply_se:
        # Optional denoising pre-pass: the SE model writes its result to
        # disk and the SR model consumes that file as its input.
        # (The call's return value is intentionally ignored; online_write=True
        # makes it write 'enhanced.wav' as a side effect.)
        myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        myClearVoice_se(input_path=input_wav, online_write=True, output_path='enhanced.wav')
        input_wav = 'enhanced.wav'
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    if isinstance(output_wav_dict, dict):
        # Some ClearVoice versions return {model_name: waveform}; take the
        # first (only) entry.
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    # Waveform appears to be (channels, samples); write the first channel.
    # NOTE(review): assumes a 2-D array here vs. the 1-D write in
    # fn_clearvoice_se — confirm against the ClearVoice SR model's output.
    sf.write('enhanced_high_res.wav', output_wav[0,:], fs)
    return 'enhanced_high_res.wav'
    
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Denoise speech with the model matching the requested sampling rate.

    Args:
        input_wav: Path to the input audio file.
        sr: Sampling-rate choice string; "16000 Hz" selects the 16 kHz
            FRCRN model, anything else the 48 kHz MossFormer2 model.

    Returns:
        Path of the written result file ('enhanced.wav').
    """
    # Pick model and output rate from the UI selection.
    if sr == "16000 Hz":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000
    engine = ClearVoice(task='speech_enhancement', model_names=[model_name])
    result = engine(input_path=input_wav, online_write=False)
    # Some ClearVoice versions return {model_name: waveform}; unwrap the
    # single entry, otherwise use the result as-is.
    if isinstance(result, dict):
        output_wav = next(iter(result.values()))
    else:
        output_wav = result
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'

demo = gr.Blocks()

# Gradio interface for the speech super-resolution task.
sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs = [
        gr.Audio(label="Input Audio", type="filepath"),
        # Fix: gr.Checkbox takes a boolean `value`, not a list of choices
        # (a list is gr.CheckboxGroup's signature); passing one here is wrong.
        gr.Checkbox(value=False, label="Apply Speech Enhancement"),
    ],
    outputs = [
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title = "<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Super Resolution",
    description = ("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
                   "To try it, simply upload your audio, or click one of the examples. "),
    article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
              "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    # Fix: the second input of this interface is the enhancement checkbox
    # (a bool), not a sampling-rate string. The previous string values
    # ("16000 Hz", ...) were copied from the speech-enhancement demo and,
    # being non-empty strings, would always count as truthy.
    examples = [
        ["examples/mandarin_speech_16kHz.wav", False],
        ["examples/LJSpeech-001-0001-22k.wav", False],
        ["examples/LibriTTS_986_129388_24k.wav", False],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples = True,
)

# Wrap the single interface in a tabbed layout and start the app.
with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

demo.launch()