Michael Hu committed on
Commit
933cc7f
·
1 Parent(s): 2d46a24

The TTS library is deprecated; use Fish Speech instead

Browse files
Files changed (5) hide show
  1. app.py +1 -1
  2. config/tts_config.yaml +13 -0
  3. download_models.py +5 -0
  4. requirements.txt +3 -6
  5. utils/tts.py +45 -37
app.py CHANGED
@@ -55,7 +55,7 @@ def handle_file_processing(upload_path):
55
 
56
  # TTS Phase
57
  status_text.markdown("🎵 **Generating Chinese Speech...**")
58
- output_path = generate_speech(chinese_text)
59
  progress_bar.progress(100)
60
 
61
  # Display results
 
55
 
56
  # TTS Phase
57
  status_text.markdown("🎵 **Generating Chinese Speech...**")
58
+ output_path = generate_speech(chinese_text,language="zh")
59
  progress_bar.progress(100)
60
 
61
  # Display results
config/tts_config.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tts:
2
+ model: fish-speech-400m-v1
3
+ vocoder: hifigan-v1
4
+ device: auto
5
+ precision: fp16
6
+
7
+ generation:
8
+ temperature: 0.7
9
+ top_k: 20
10
+ max_length: 4096
11
+ language_mapping:
12
+ zh: "[ZH]{text}[ZH]"
13
+ en: "[EN]{text}[EN]"
download_models.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from fish_audio.sdk.utils import download_all_models
2
+
3
+ if __name__ == "__main__":
4
+ download_all_models()
5
+ print("All models downloaded to ~/.cache/fish_audio")
requirements.txt CHANGED
@@ -4,10 +4,7 @@ nltk>=3.8
4
  librosa>=0.10
5
  soundfile>=0.12
6
  ffmpeg-python>=0.2
7
- torch>=2.0,<3.0
8
  transformers[audio]>=4.33
9
- TTS>=0.20,<1.0
10
-
11
- numpy<1.28 # 强制使用 NumPy 1.x 版本
12
- scipy~=1.11.0 # 与 NumPy 1.x 兼容的 SciPy 版本
13
- scikit-learn~=1.3.0 # 兼容旧版 NumPy 的 scikit-learn
 
4
  librosa>=0.10
5
  soundfile>=0.12
6
  ffmpeg-python>=0.2
 
7
  transformers[audio]>=4.33
8
+ fish-audio-sdk>=0.0.7
9
+ torch>=2.1.0
10
+ torchaudio>=2.1.0
 
 
utils/tts.py CHANGED
@@ -1,46 +1,54 @@
1
- """
2
- Text-to-Speech Module using YourTTS
3
- Handles speech synthesis and output generation
4
- """
5
-
6
- from TTS.api import TTS
7
- import os
8
  import time
 
 
 
 
 
9
 
10
- def generate_speech(text):
11
- """
12
- Convert Chinese text to natural-sounding speech
13
- Args:
14
- text: Input Chinese text
15
- Returns:
16
- Path to generated audio file
17
- """
18
- # Initialize TTS engine
19
- tts = TTS(
20
- model_name="tts_models/multilingual/multi-dataset/your_tts",
21
- progress_bar=False,
22
- gpu=False
23
- )
 
 
 
 
 
 
 
 
24
 
25
- # Create unique output filename
26
- output_path = os.path.join(
27
- "temp/outputs",
28
- f"output_{int(time.time())}.wav"
 
 
29
  )
30
 
31
- # Use reference voice if available
32
- ref_voice = (
33
- "assets/reference_voice.wav"
34
- if os.path.exists("assets/reference_voice.wav")
35
- else None
36
- )
37
 
38
- # Generate speech output
39
- tts.tts_to_file(
40
- text=text,
41
- speaker_wav=ref_voice,
42
- language="zh-cn",
43
- file_path=output_path
44
  )
45
 
 
 
 
 
46
  return output_path
 
 
 
 
 
 
 
 
1
  import time
2
+ import yaml
3
+ from pathlib import Path
4
+ import torch
5
+ from fish_audio.sdk import TextToSpeech, Vocoder
6
+ from pydub import AudioSegment
7
 
8
+ # Load config
9
+ config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
10
+ with open(config_path) as f:
11
+ config = yaml.safe_load(f)
12
+
13
+ # Initialize models
14
+ tts_model = TextToSpeech(
15
+ model_name=config["tts"]["model"],
16
+ device=config["tts"]["device"],
17
+ precision=config["tts"]["precision"],
18
+ )
19
+
20
+ vocoder = Vocoder(
21
+ model_name=config["tts"]["vocoder"],
22
+ device=tts_model.device,
23
+ )
24
+
25
+ def generate_speech(text: str, language: str = "zh") -> str:
26
+ """Generate speech from text using Fish Audio SDK"""
27
+ # Format text with language tags
28
+ lang_template = config["generation"]["language_mapping"][language]
29
+ processed_text = lang_template.format(text=text)
30
 
31
+ # Generate mel spectrogram
32
+ mel = tts_model.generate(
33
+ text=processed_text,
34
+ temperature=config["generation"]["temperature"],
35
+ top_k=config["generation"]["top_k"],
36
+ max_length=config["generation"]["max_length"],
37
  )
38
 
39
+ # Convert mel to waveform
40
+ waveform = vocoder(mel)
 
 
 
 
41
 
42
+ # Create audio segment
43
+ audio = AudioSegment(
44
+ waveform.numpy().tobytes(),
45
+ frame_rate=vocoder.sample_rate,
46
+ sample_width=2,
47
+ channels=1,
48
  )
49
 
50
+ # Save output
51
+ output_path = f"temp/outputs/output_{int(time.time())}.wav"
52
+ audio.export(output_path, format="wav")
53
+
54
  return output_path