Michael Hu committed
Commit 9c8546d · 1 Parent(s): 78cdfc3

switch to use kokoro

Files changed (4):
  1. config/tts_config.yaml +0 -13
  2. download_models.py +0 -5
  3. requirements.txt +6 -2
  4. utils/tts.py +66 -47
config/tts_config.yaml DELETED
@@ -1,13 +0,0 @@
-tts:
-  model: fish-speech-400m-v1
-  vocoder: hifigan-v1
-  device: auto
-  precision: fp16
-
-generation:
-  temperature: 0.7
-  top_k: 20
-  max_length: 4096
-  language_mapping:
-    zh: "[ZH]{text}[ZH]"
-    en: "[EN]{text}[EN]"
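
For reference, the removed language_mapping drove a simple template step in the old utils/tts.py (shown further down in this diff); the new Kokoro path drops it and passes text straight to the model. A minimal illustration of the old behaviour:

# How the deleted config's language_mapping was applied (old utils/tts.py):
lang_template = "[ZH]{text}[ZH]"                    # language_mapping["zh"]
processed_text = lang_template.format(text="你好")  # -> "[ZH]你好[ZH]"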
 
download_models.py DELETED
@@ -1,5 +0,0 @@
-from fish_audio.sdk.utils import download_all_models
-
-if __name__ == "__main__":
-    download_all_models()
-    print("All models downloaded to ~/.cache/fish_audio")
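
The Fish Audio download helper goes away without a direct replacement; the new utils/tts.py below instead expects the Kokoro-82M repository to already be cloned into ./kokoro. A possible stand-in using huggingface_hub is sketched here; it is not part of this commit, and the repo id and target directory are simply taken from the error message in utils/tts.py.

# Sketch only (not in this commit): fetch the Kokoro-82M checkpoint and voice
# packs into the ./kokoro directory that the new utils/tts.py expects.
from huggingface_hub import snapshot_download

if __name__ == "__main__":
    path = snapshot_download(repo_id="hexgrad/Kokoro-82M", local_dir="./kokoro")
    print(f"Kokoro model files downloaded to {path}")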
 
requirements.txt CHANGED
@@ -5,6 +5,10 @@ librosa>=0.10
 soundfile>=0.12
 ffmpeg-python>=0.2
 transformers[audio]>=4.33
-fish-audio-sdk>=0.0.7
 torch>=2.1.0
-torchaudio>=2.1.0
+torchaudio>=2.1.0
+phonemizer>=3.0
+espeak-ng>=1.51
+scipy>=1.11
+munch>=2.5
+git+https://github.com/hexgrad/Kokoro-82M
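
The new phonemizer dependency needs the espeak-ng shared library at the operating-system level; the pip entry alone may not provide libespeak-ng, so on a Hugging Face Space the apt package is typically added via packages.txt as well. A small standard-library check, included here only as a sketch:

# Sketch only (not in this commit): confirm libespeak-ng is installed before
# phonemizer / EspeakWrapper tries to load it.
import ctypes.util

lib = ctypes.util.find_library("espeak-ng")
if lib is None:
    raise RuntimeError("libespeak-ng not found; install the espeak-ng system "
                       "package (e.g. add 'espeak-ng' to packages.txt).")
print(f"Found espeak-ng shared library: {lib}")
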
utils/tts.py CHANGED
@@ -1,54 +1,73 @@
-import time
-import yaml
-from pathlib import Path
+import os
 import torch
-from fish_audio_sdk import TextToSpeech, Vocoder
+import time
 from pydub import AudioSegment
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from models import build_model
+
+# Hugging Face Spaces setup
+MODEL_DIR = "./kokoro"
+os.makedirs(MODEL_DIR, exist_ok=True)

-# Load config
-config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
-with open(config_path) as f:
-    config = yaml.safe_load(f)
+# Configure espeak-ng for Hugging Face environment
+EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')
+
+class TTSEngine:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self._verify_model_files()
+        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
+        self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
+                                map_location=self.device)
+
+    def _verify_model_files(self):
+        """Ensure required model files exist"""
+        required_files = [
+            f"{MODEL_DIR}/kokoro-v0_19.pth",
+            f"{MODEL_DIR}/voices/af_bella.pt"
+        ]
+
+        missing = [f for f in required_files if not os.path.exists(f)]
+        if missing:
+            raise FileNotFoundError(
+                f"Missing model files: {missing}\n"
+                "Add this to your Hugging Face Space settings:\n"
+                "App setup -> Clone Kokoro repository: "
+                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
+            )

-# Initialize models
-tts_model = TextToSpeech(
-    model_name=config["tts"]["model"],
-    device=config["tts"]["device"],
-    precision=config["tts"]["precision"],
-)
+    def generate_speech(self, text: str, language: str = "zh") -> str:
+        """Generate speech from Chinese text"""
+        from kokoro import generate_full
+
+        # Safety checks for Hugging Face Free Tier
+        if len(text) > 500:
+            text = text[:495] + "[TRUNCATED]"
+
+        audio, _ = generate_full(
+            self.model,
+            text,
+            self.voice,
+            lang='en-us',
+            max_len=200 if self.device == "cpu" else 500
+        )
+
+        # Save output
+        output_path = f"temp/outputs/output_{int(time.time())}.wav"
+        AudioSegment(
+            audio.numpy().tobytes(),
+            frame_rate=24000,
+            sample_width=2,
+            channels=1
+        ).export(output_path, format="wav")
+
+        return output_path

-vocoder = Vocoder(
-    model_name=config["tts"]["vocoder"],
-    device=tts_model.device,
-)
+# Initialize TTS engine once
+@st.cache_resource
+def get_tts_engine():
+    return TTSEngine()

 def generate_speech(text: str, language: str = "zh") -> str:
-    """Generate speech from text using Fish Audio SDK"""
-    # Format text with language tags
-    lang_template = config["generation"]["language_mapping"][language]
-    processed_text = lang_template.format(text=text)
-
-    # Generate mel spectrogram
-    mel = tts_model.generate(
-        text=processed_text,
-        temperature=config["generation"]["temperature"],
-        top_k=config["generation"]["top_k"],
-        max_length=config["generation"]["max_length"],
-    )
-
-    # Convert mel to waveform
-    waveform = vocoder(mel)
-
-    # Create audio segment
-    audio = AudioSegment(
-        waveform.numpy().tobytes(),
-        frame_rate=vocoder.sample_rate,
-        sample_width=2,
-        channels=1,
-    )
-
-    # Save output
-    output_path = f"temp/outputs/output_{int(time.time())}.wav"
-    audio.export(output_path, format="wav")
-
-    return output_path
+    """Public interface for TTS generation"""
+    return get_tts_engine().generate_speech(text, language)
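
One thing to watch: the new module decorates get_tts_engine with @st.cache_resource but never imports Streamlit, so it only runs once an "import streamlit as st" line is added at the top of utils/tts.py. A minimal usage sketch under that assumption, which also creates the temp/outputs directory that generate_speech writes to but does not create itself:

# Usage sketch (assumptions: "import streamlit as st" is added to utils/tts.py
# and this snippet runs inside the Space's Streamlit app).
import os
import streamlit as st

from utils.tts import generate_speech

os.makedirs("temp/outputs", exist_ok=True)  # generate_speech writes here

text = st.text_input("Text to synthesize", "Hello from Kokoro")
if st.button("Generate speech"):
    wav_path = generate_speech(text, language="en")
    st.audio(wav_path)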