Spaces:
Running
Running
Michael Hu
committed on
Commit
·
933cc7f
1
Parent(s):
2d46a24
tts is deprecated, use fish speech
Browse files
- app.py +1 -1
- config/tts_config.yaml +13 -0
- download_models.py +5 -0
- requirements.txt +3 -6
- utils/tts.py +45 -37
app.py
CHANGED
@@ -55,7 +55,7 @@ def handle_file_processing(upload_path):
|
|
55 |
|
56 |
# TTS Phase
|
57 |
status_text.markdown("🎵 **Generating Chinese Speech...**")
|
58 |
-
output_path = generate_speech(chinese_text)
|
59 |
progress_bar.progress(100)
|
60 |
|
61 |
# Display results
|
|
|
55 |
|
56 |
# TTS Phase
|
57 |
status_text.markdown("🎵 **Generating Chinese Speech...**")
|
58 |
+
output_path = generate_speech(chinese_text,language="zh")
|
59 |
progress_bar.progress(100)
|
60 |
|
61 |
# Display results
|
config/tts_config.yaml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tts:
|
2 |
+
model: fish-speech-400m-v1
|
3 |
+
vocoder: hifigan-v1
|
4 |
+
device: auto
|
5 |
+
precision: fp16
|
6 |
+
|
7 |
+
generation:
|
8 |
+
temperature: 0.7
|
9 |
+
top_k: 20
|
10 |
+
max_length: 4096
|
11 |
+
language_mapping:
|
12 |
+
zh: "[ZH]{text}[ZH]"
|
13 |
+
en: "[EN]{text}[EN]"
|
download_models.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fish_audio.sdk.utils import download_all_models
|
2 |
+
|
3 |
+
def main():
    """Fetch every Fish Audio model and report where the files were stored."""
    download_all_models()
    print("All models downloaded to ~/.cache/fish_audio")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
@@ -4,10 +4,7 @@ nltk>=3.8
|
|
4 |
librosa>=0.10
|
5 |
soundfile>=0.12
|
6 |
ffmpeg-python>=0.2
|
7 |
-
torch>=2.0,<3.0
|
8 |
transformers[audio]>=4.33
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
scipy~=1.11.0 # 与 NumPy 1.x 兼容的 SciPy 版本
|
13 |
-
scikit-learn~=1.3.0 # 兼容旧版 NumPy 的 scikit-learn
|
|
|
4 |
librosa>=0.10
|
5 |
soundfile>=0.12
|
6 |
ffmpeg-python>=0.2
|
|
|
7 |
transformers[audio]>=4.33
|
8 |
+
fish-audio-sdk>=0.0.7
|
9 |
+
torch>=2.1.0
|
10 |
+
torchaudio>=2.1.0
|
|
|
|
utils/tts.py
CHANGED
@@ -1,46 +1,54 @@
|
|
1 |
-
"""
|
2 |
-
Text-to-Speech Module using YourTTS
|
3 |
-
Handles speech synthesis and output generation
|
4 |
-
"""
|
5 |
-
|
6 |
-
from TTS.api import TTS
|
7 |
-
import os
|
8 |
import time
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
"""
|
18 |
-
|
19 |
-
tts
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
#
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
29 |
)
|
30 |
|
31 |
-
#
|
32 |
-
|
33 |
-
"assets/reference_voice.wav"
|
34 |
-
if os.path.exists("assets/reference_voice.wav")
|
35 |
-
else None
|
36 |
-
)
|
37 |
|
38 |
-
#
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
)
|
45 |
|
|
|
|
|
|
|
|
|
46 |
return output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import time
|
2 |
+
import yaml
|
3 |
+
from pathlib import Path
|
4 |
+
import torch
|
5 |
+
from fish_audio.sdk import TextToSpeech, Vocoder
|
6 |
+
from pydub import AudioSegment
|
7 |
|
8 |
+
# Load the TTS settings from config/tts_config.yaml, located two directories
# above this file, so the lookup does not depend on the working directory.
config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
# Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
# Windows) would mis-decode any non-ASCII text added to the YAML file.
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Initialize models once at import time so every generate_speech() call
# reuses the same instances.
tts_model = TextToSpeech(
    model_name=config["tts"]["model"],
    device=config["tts"]["device"],
    precision=config["tts"]["precision"],
)

# The vocoder is placed on whatever device the TTS model reports.
vocoder = Vocoder(
    model_name=config["tts"]["vocoder"],
    device=tts_model.device,
)
|
24 |
+
|
25 |
+
def generate_speech(text: str, language: str = "zh") -> str:
    """Synthesize speech for *text* and return the path of the written WAV file.

    The text is wrapped in the language template configured under
    ``generation.language_mapping``, turned into a mel spectrogram by the
    module-level ``tts_model``, converted to a waveform by the module-level
    ``vocoder``, and exported as mono 16-bit WAV under ``temp/outputs``.

    Args:
        text: The text to speak.
        language: Key into ``generation.language_mapping`` (default ``"zh"``).

    Returns:
        Relative path of the generated ``.wav`` file.

    Raises:
        KeyError: If *language* has no template in the configuration.
    """
    # Local import keeps this block self-contained; numpy ships with torch.
    import numpy as np

    generation_cfg = config["generation"]

    # Format text with language tags
    language_templates = generation_cfg["language_mapping"]
    try:
        lang_template = language_templates[language]
    except KeyError:
        supported = ", ".join(sorted(language_templates))
        raise KeyError(
            f"Unsupported language {language!r}; supported: {supported}"
        ) from None
    processed_text = lang_template.format(text=text)

    # Generate mel spectrogram
    mel = tts_model.generate(
        text=processed_text,
        temperature=generation_cfg["temperature"],
        top_k=generation_cfg["top_k"],
        max_length=generation_cfg["max_length"],
    )

    # Convert mel to waveform
    waveform = vocoder(mel)

    # Bring the samples to host memory as a flat array. Tensor.numpy() fails
    # on accelerator-resident or grad-tracking tensors, hence detach()/cpu().
    if isinstance(waveform, torch.Tensor):
        waveform = waveform.detach().cpu()
        if waveform.is_floating_point():
            # Covers fp16/bf16 output; bf16 has no numpy equivalent.
            waveform = waveform.float()
        samples = waveform.numpy()
    else:
        samples = np.asarray(waveform)
    samples = samples.reshape(-1)

    # AudioSegment below is told sample_width=2 (16-bit PCM), so float samples
    # must be converted rather than having their raw bytes reinterpreted.
    # NOTE(review): assumes float output is normalized to [-1.0, 1.0] --
    # confirm against the vocoder's documentation.
    if np.issubdtype(samples.dtype, np.floating):
        clipped = np.clip(samples.astype(np.float32), -1.0, 1.0)
        samples = (clipped * 32767.0).astype(np.int16)

    # Create audio segment
    audio = AudioSegment(
        samples.tobytes(),
        frame_rate=vocoder.sample_rate,
        sample_width=2,
        channels=1,
    )

    # Save output; create the directory first so a fresh checkout works.
    output_path = f"temp/outputs/output_{int(time.time())}.wav"
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    audio.export(output_path, format="wav")

    return output_path
|