# toolbox-tts / app.py
# Gradio demo: multispeaker FastPitch + HiFi-GAN text-to-speech (NVIDIA NeMo).
# (Header reconstructed from Hugging Face file-viewer residue; original
# commit 462cad0, "update tts".)
import gradio as gr
import torch
import torchaudio
import tempfile
import numpy as np
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import MixerTTSModel
# Alternative single-speaker Mixer-TTS-X checkpoints, kept for reference
# (not loaded):
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")
# Download/load the pretrained checkpoints once at import time:
# FastPitch produces mel spectrograms; HiFi-GAN vocodes them into audio.
# Both are switched to eval mode since this app only runs inference.
spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
spec_generator.eval()
voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
voc_model.eval()
def greet(name):
    """Return a ``"Hello <name>!!"`` greeting (leftover demo helper, unused)."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
def generate_tts(text: str, speaker: int = 0):
    """Synthesize *text* with the multispeaker FastPitch + HiFi-GAN models.

    Args:
        text: Sentence to synthesize.
        speaker: Speaker id. The Gradio slider may deliver a float, so the
            value is coerced to ``int`` before reaching the model.

    Returns:
        Path to a temporary ``.wav`` file containing the synthesized audio;
        Gradio serves the file from this path.
    """
    # Assumed sample rate of the HiFi-TTS-trained checkpoints loaded above
    # (44.1 kHz) — TODO confirm against the model card.
    sr = 44100
    # inference_mode: no autograd bookkeeping during synthesis — less memory,
    # and guarantees the output tensor carries no grad state when saved.
    with torch.inference_mode():
        parsed = spec_generator.parse(text)
        spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=int(speaker))
        audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
    # delete=False: the file must outlive this function so Gradio can read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        torchaudio.save(fp.name, audio.to('cpu'), sample_rate=sr)
    return fp.name
def run():
    """Build the Gradio UI around :func:`generate_tts` and serve it.

    Binds to all interfaces on port 7860 (the standard HF Spaces port).
    """
    text_box = gr.Textbox(value="This is a test.", label="Text to Synthesize")
    speaker_slider = gr.Slider(0, 10, step=1, label="Speaker")
    audio_out = gr.Audio(label="Output", type="filepath")
    demo = gr.Interface(
        fn=generate_tts,
        inputs=[text_box, speaker_slider],
        outputs=audio_out,
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    run()