Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,214 Bytes
02c7bdf bdaf47a 02c7bdf 7b02833 02c7bdf a1655f3 02c7bdf 6e4d760 341eb54 6e4d760 75a5cbb 341eb54 6e4d760 e537531 e63a812 3e28721 e63a812 a9e592e 132a2a9 e63a812 02c7bdf 3192961 6e4d760 b78b7d0 e805751 341eb54 b78b7d0 e805751 b78b7d0 9f61737 6e4d760 8310825 43e8301 b78b7d0 dc8fb4a 9ec5c63 90fa92b dc8fb4a b78b7d0 99710ec 863f583 49effbd 0f8dddd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os
@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Upsample speech to 48 kHz, optionally denoising it first.

    Args:
        input_wav: Path to the input audio file (Gradio ``filepath``).
        apply_se: If truthy, run 48 kHz speech enhancement before the
            super-resolution pass.

    Returns:
        Path to the written 48 kHz result file
        (``'enhanced_high_res.wav'``).
    """
    myClearVoice = ClearVoice(task='speech_super_resolution',
                              model_names=['MossFormer2_SR_48K'])
    fs = 48000  # MossFormer2_SR_48K always produces 48 kHz audio

    if apply_se:
        # Denoise first; the enhanced file becomes the SR input.
        myClearVoice_se = ClearVoice(task='speech_enhancement',
                                     model_names=['MossFormer2_SE_48K'])
        myClearVoice_se(input_path=input_wav, online_write=True,
                        output_path='enhanced.wav')
        input_wav = 'enhanced.wav'

    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # The model may return either a dict of named outputs or a bare array;
    # take the first entry in the dict case.
    if isinstance(output_wav_dict, dict):
        output_wav = next(iter(output_wav_dict.values()))
    else:
        output_wav = output_wav_dict

    # Model output is presumably (channels, samples) — select the first
    # row, but only when the array is actually 2-D so a 1-D output does
    # not raise IndexError.  TODO(review): confirm ClearVoice output shape.
    if getattr(output_wav, 'ndim', 1) > 1:
        output_wav = output_wav[0, :]
    sf.write('enhanced_high_res.wav', output_wav, fs)
    return 'enhanced_high_res.wav'
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Denoise speech at either 16 kHz or 48 kHz.

    Args:
        input_wav: Path to the input audio file (Gradio ``filepath``).
        sr: Sampling-rate choice string; ``"16000 Hz"`` selects the
            FRCRN 16 kHz model, anything else the MossFormer2 48 kHz model.

    Returns:
        Path to the written enhanced file (``'enhanced.wav'``).
    """
    if sr == "16000 Hz":
        myClearVoice = ClearVoice(task='speech_enhancement',
                                  model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement',
                                  model_names=['MossFormer2_SE_48K'])
        fs = 48000

    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # The model may return either a dict of named outputs or a bare array;
    # take the first entry in the dict case.
    if isinstance(output_wav_dict, dict):
        output_wav = next(iter(output_wav_dict.values()))
    else:
        output_wav = output_wav_dict

    # Collapse a (1, n_samples) output to 1-D: soundfile interprets a 2-D
    # array as (frames, channels), so writing the raw (1, n) array would
    # produce a one-frame, n-channel file.  Matches fn_clearvoice_sr.
    if getattr(output_wav, 'ndim', 1) > 1:
        output_wav = output_wav[0, :]
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'
# Assemble the Gradio UI: one tab hosting the super-resolution interface.
demo = gr.Blocks()

sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        # gr.Checkbox's first positional argument is a boolean initial
        # value, not a list of choices.
        gr.Checkbox(value=False, label="Apply Speech Enhancement"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    # Each example row must match the inputs above: (audio path,
    # apply-speech-enhancement flag).  The previous sample-rate strings
    # belonged to a different (speech-enhancement) interface.
    examples=[
        ["examples/mandarin_speech_16kHz.wav", False],
        ["examples/LJSpeech-001-0001-22k.wav", False],
        ["examples/LibriTTS_986_129388_24k.wav", False],
        ["examples/english_speech_48kHz.wav", False],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

demo.launch()