# NOTE: the six lines below are residue from the Hugging Face Spaces file viewer (Space status,
# file size, the per-line commit-hash column and the line-number gutter). They are not part of
# the program and are kept only as comments so the file parses as Python.
# Spaces:
# Runtime error
# Runtime error
# File size: 7,933 Bytes
# c7ff3c2 e357617 c7ff3c2 e357617 bb2b924 e357617 c7ff3c2 e357617 bb2b924 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 bb2b924 e357617 bb2b924 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 c7ff3c2 e357617 fa5c867 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
from functools import lru_cache
from torch.cuda.amp import autocast
import time
# Run on the first CUDA device when one is visible to torch, otherwise on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on GPU; on CPU the weights are loaded in float32
# so that inference does not depend on float16 CPU kernels being available.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32

repo_id = "parler-tts/parler-tts-mini-expresso"

model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# Adds optimisation via torch.compile.
# NOTE(review): confirm that generate() actually goes through the compiled wrapper.
model = torch.compile(model)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate returned alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Fixed seed applied before each generation call.
SEED = 42

# Initial contents of the two text boxes in the UI.
default_text = "*Remember* - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of *five times*."
default_description = "Thomas speaks with emphasis and excitement at a moderate pace with high quality."
# Pre-filled demo rows for the examples widget.
# Each row is [input text, voice/style description], in the same order as the UI inputs.
examples = [
    # Sad delivery.
    [
        "Remember - this is only the first iteration of the model. To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
        "Thomas speaks in a sad tone at a moderate pace with high quality.",
    ],
    # Fast, excited delivery.
    [
        "Did you know? You can reproduce this entire training recipe by following the steps outlined on the model card!",
        "Talia speaks quickly with excitement and high quality audio.",
    ],
    # Happy, slightly slow delivery.
    [
        "But that's no secret! The entire project is open source first, with all release artefacts on the Hub.",
        "Elisabeth speaks happily at a slightly slower than average pace with high quality audio.",
    ],
    # Confused delivery.
    [
        "Hey there! I'm Jerry. Or at least I think I am? I just need to check that quickly.",
        "Jerry speaks in a confused tone at a moderately slow pace with high quality audio.",
    ],
    # Laughter, triggered by the <laugh> tag in the text.
    [
        "<laugh> It can even laugh! Do you believe it ? I don't!",
        "Talia speaks with laughter with high quality.",
    ],
]
number_normalizer = EnglishNumberNormalizer()

# An upper-case letter followed by one or more upper-case letters or dots,
# delimited by word boundaries (e.g. "USA", "U.S.A"). Compiled once at import.
_ABBREVIATION_RE = re.compile(r'\b[A-Z][A-Z\.]+\b')


def preprocess(text):
    """Normalise raw input text before it is tokenised as the TTS prompt.

    Steps, in order:
      1. run the text through ``number_normalizer`` and strip surrounding whitespace;
      2. append a full stop when the text does not already end in punctuation;
      3. spell out upper-case abbreviations letter by letter, dropping dots
         ("U.S.A" -> "U S A").

    Args:
        text: The user-supplied text to be spoken.

    Returns:
        The normalised text. If nothing is left after normalisation and
        stripping, the empty string is returned unchanged instead of raising
        ``IndexError`` on ``text[-1]``.
    """
    text = number_normalizer(text).strip()
    if not text:
        # Nothing to punctuate or expand; avoid indexing into an empty string.
        return text

    if text[-1] not in punctuation:
        text = f"{text}."

    def separate_abb(match):
        # Drop the dots and put a space between the remaining letters.
        return " ".join(match.group(0).replace(".", ""))

    # A single substitution pass rewrites each match where it was found. The
    # earlier findall + str.replace loop could corrupt a longer abbreviation
    # when a shorter one was one of its substrings (e.g. "US" inside "USA").
    return _ABBREVIATION_RE.sub(separate_abb, text)
@lru_cache(maxsize=128)
def cached_tokenizer(text):
    """Tokenise ``text`` into PyTorch tensors placed on ``device``.

    Results are memoised per distinct input string (up to 128 entries), so a
    repeated prompt or description is not tokenised again.
    """
    encoded = tokenizer(text, return_tensors="pt")
    return encoded.to(device)
@spaces.GPU
def gen_tts(text, description):
    """Generate speech for ``text`` in the voice/style given by ``description``.

    Args:
        text: The sentence(s) to speak; passed through ``preprocess`` first.
        description: Free-text description of the speaker and delivery.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated waveform as a squeezed NumPy array.
    """
    start_time = time.time()

    inputs = cached_tokenizer(description)
    prompt = cached_tokenizer(preprocess(text))

    # Re-seed before every call so the same inputs give the same output.
    set_seed(SEED)

    # Gradients are disabled for inference. Mixed precision is enabled only
    # when CUDA is present; torch.autocast replaces the deprecated
    # torch.cuda.amp.autocast and does not warn on CPU-only hosts.
    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
        generation = model.generate(
            input_ids=inputs.input_ids,
            prompt_input_ids=prompt.input_ids,
            max_length=200,  # Limits the maximum length of the output
            # num_beams is intentionally not set: Parler-TTS's generate() only
            # supports greedy and sampling decoding and rejects beam search.
        )

    audio_arr = generation.cpu().numpy().squeeze()

    end_time = time.time()
    print(f"Generation completed in {end_time - start_time:.2f} seconds")
    return SAMPLE_RATE, audio_arr
# Custom stylesheet handed to gr.Blocks(css=css).
# All rules target the element ids "share-btn-container" and "share-btn".
# NOTE(review): no component in the visible layout declares these elem_ids, so
# the rules may be unused leftovers — confirm before removing.
css = """
#share-btn-container {
display: flex;
padding-left: 0.5rem !important;
padding-right: 0.5rem !important;
background-color: #000000;
justify-content: center;
align-items: center;
border-radius: 9999px !important;
width: 13rem;
margin-top: 10px;
margin-left: auto;
flex: unset !important;
}
#share-btn {
all: initial;
color: #ffffff;
font-weight: 600;
cursor: pointer;
font-family: 'IBM Plex Sans', sans-serif;
margin-left: 0.5rem !important;
padding-top: 0.25rem !important;
padding-bottom: 0.25rem !important;
right:0;
}
#share-btn * {
all: unset !important;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
"""
# Build the demo page: title banner, usage notes, the input/output widgets,
# the examples table and a closing note. The HTML string contents are kept
# flush-left so that their bytes are unchanged.
with gr.Blocks(css=css) as block:
    # Title banner.
    gr.HTML(
        """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
Parler-TTS: Expresso ☕️️
</h1>
</div>
</div>
"""
    )
    # Model description and prompting tips. This is a plain string: the text
    # has no placeholders, so the former f-prefix did nothing.
    gr.HTML(
        """
<p><a href="https://huggingface.co/parler-tts/parler-tts-mini-expresso"> Parler-TTS Mini: Expresso</a>
is a text-to-speech (TTS) model fine-tuned on the <a href="https://huggingface.co/datasets/ylacombe/expresso"> Expresso dataset</a>.
It generates high-quality speech in a given <b>emotion</b> and <b>voice</b> that can be controlled through a simple text prompt.</p>
<p>Tips for ensuring good generation:
<ul>
<li>Specify the name of a male speaker (Jerry, Thomas) or female speaker (Talia, Elisabeth) for consistent voices</li>
<li>The model can generate in a range of emotions, including: "happy", "confused", "default" (meaning no particular emotion conveyed), "laughing", "sad", "whisper", "emphasis"</li>
<li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
<li>To emphasise particular words, wrap them in asterisk (e.g. *you* in the example above) and include "emphasis" in the prompt</li>
</ul>
</p>
"""
    )
    # Left column: text inputs and the trigger button. Right column: audio output.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            description = gr.Textbox(label="Description", lines=2, value=default_description, elem_id="input_description")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

    # The same input/output wiring is used by the examples table and the button.
    inputs = [input_text, description]
    outputs = [audio_out]
    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)

    # Closing note with roadmap and licence links.
    gr.HTML(
        """
<p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention
and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
<a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
"""
    )

# Enable request queuing, then start the server. A stray trailing "|" after
# the launch call (web-page residue, a syntax error) has been removed.
# NOTE(review): share=True may have no effect when hosted on Hugging Face Spaces — confirm.
block.queue()
block.launch(share=True)