# FireRedTTS / app.py
# FireRedTeam's picture
# Update app.py
# 7b98f12 verified
# raw
# history blame contribute delete
# 2.14 kB
import gradio as gr
import numpy as np
import os
import requests
import spaces
from fireredtts.fireredtts import FireRedTTS
def download_file(url, filename):
    """Download ``url`` and store the response body at ``filename``.

    The body is streamed in chunks to a temporary ``<filename>.part`` file
    and only renamed to ``filename`` after the transfer finishes, so an
    interrupted download never leaves a truncated file at the final path.
    Missing parent directories of ``filename`` are created first.

    A non-200 HTTP status is reported on stdout and nothing is written;
    network and filesystem errors propagate to the caller.

    Args:
        url: Address of the file to fetch.
        filename: Destination path on the local filesystem.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{filename}.part"
    # (connect, read) timeouts: without them a stalled connection blocks forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Failed to download file: HTTP {response.status_code}")
            return
        try:
            with open(partial_path, 'wb') as file:
                # Stream in 1 MiB chunks instead of buffering the whole
                # body in memory via ``response.content``.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
            os.replace(partial_path, filename)
        finally:
            # After a successful rename the partial file no longer exists;
            # on any failure this removes the incomplete data.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print(f"File downloaded successfully: {filename}")
# Fetch every pretrained checkpoint that is not on disk yet. Each file is
# checked individually so that a missing speaker or token2wav checkpoint is
# re-fetched even when the GPT checkpoint is already present.
_CHECKPOINT_BASE_URL = 'https://huggingface.co/fireredteam/FireRedTTS/resolve/main'
_CHECKPOINT_FILES = (
    'fireredtts_gpt.pt',
    'fireredtts_speaker.bin',
    'fireredtts_token2wav.pt',
)
_missing_checkpoints = [
    name for name in _CHECKPOINT_FILES
    if not os.path.exists(f'pretrained_models/{name}')
]
if _missing_checkpoints:
    print("Start to download checkpoints...")
    # The target directory must exist before files can be written into it.
    os.makedirs('pretrained_models', exist_ok=True)
    for _checkpoint_name in _missing_checkpoints:
        download_file(f'{_CHECKPOINT_BASE_URL}/{_checkpoint_name}',
                      f'pretrained_models/{_checkpoint_name}')
# Sample rate returned to the UI together with each generated waveform.
# NOTE(review): presumably matches the "24k" model config below - confirm.
sampling_rate = 24000

# Build the synthesizer once at import time; it is shared by every request.
tts = FireRedTTS(
    pretrained_path='pretrained_models',
    config_path="configs/config_24k.json",
)
@spaces.GPU
def tts_inference(text, prompt_wav='examples/prompt_1.wav', lang='zh'):
    """Synthesize ``text`` and return it as 16-bit PCM audio.

    Args:
        text: Text to synthesize.
        prompt_wav: Path of the reference audio passed to the model. ``None``
            falls back to the bundled example prompt.
        lang: Language code passed to the model. ``None`` falls back to
            ``'zh'``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
        ``np.int16`` array, peak-normalized to 90% of full scale unless the
        waveform is empty or entirely silent.
    """
    # The UI passes every input positionally, so an empty component arrives
    # as None and overrides the signature defaults; restore them here.
    if prompt_wav is None:
        prompt_wav = 'examples/prompt_1.wav'
    if lang is None:
        lang = 'zh'

    # Model inference
    syn_audio = tts.synthesize(
        prompt_wav=prompt_wav,
        text=text,
        lang=lang,
    )[0].detach().cpu().numpy()

    # Normalize volume. Skip it for empty or all-zero output: np.max raises on
    # an empty array, and dividing by a zero peak would fill the result with NaN.
    peak = np.max(np.abs(syn_audio)) if syn_audio.size else 0
    if peak > 0:
        syn_audio = syn_audio / peak * 0.9

    # Convert audio data type
    syn_audio = (syn_audio * 32768).astype(np.int16)
    return sampling_rate, syn_audio
# Web UI definition. The order of `inputs` matches the positional parameters
# of `tts_inference` (text, prompt_wav, lang).
iface = gr.Interface(
    title="FireRedTTS: A Foundation Text-To-Speech Framework for Industry-Level Generative Speech Applications",
    # description="Enter some text and listen to the generated speech."
    fn=tts_inference,
    inputs=[
        gr.Textbox(label="Input text here"),
        gr.Audio(label="Upload reference audio", type="filepath"),
        gr.Dropdown(["en", "zh"], label="Select language"),
    ],
    outputs=gr.Audio(label="Generated audio"),
)

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()