Spaces:

DroolingPanda
/

teachingAssistant

Running

App Files Files Community

Michael Hu committed on Jan 26

Commit

9c8546d

1 Parent(s): 78cdfc3

switch to use kokoro

Browse files

Files changed (4) hide show

config/tts_config.yaml +0 -13
download_models.py +0 -5
requirements.txt +6 -2
utils/tts.py +66 -47

config/tts_config.yaml DELETED Viewed

@@ -1,13 +0,0 @@
-tts:
-  model: fish-speech-400m-v1
-  vocoder: hifigan-v1
-  device: auto
-  precision: fp16
-generation:
-  temperature: 0.7
-  top_k: 20
-  max_length: 4096
-  language_mapping:
-    zh: "[ZH]{text}[ZH]"
-    en: "[EN]{text}[EN]"

download_models.py DELETED Viewed

@@ -1,5 +0,0 @@
-from fish_audio.sdk.utils import download_all_models
-if __name__ == "__main__":
-    download_all_models()
-    print("All models downloaded to ~/.cache/fish_audio")

requirements.txt CHANGED Viewed

@@ -5,6 +5,10 @@ librosa>=0.10
 soundfile>=0.12
 ffmpeg-python>=0.2
 transformers[audio]>=4.33
-fish-audio-sdk>=0.0.7
 torch>=2.1.0
-torchaudio>=2.1.0

 soundfile>=0.12
 ffmpeg-python>=0.2
 transformers[audio]>=4.33
 torch>=2.1.0
+torchaudio>=2.1.0
+phonemizer>=3.0
+espeak-ng>=1.51
+scipy>=1.11
+munch>=2.5
+git+https://github.com/hexgrad/Kokoro-82M

utils/tts.py CHANGED Viewed

@@ -1,54 +1,73 @@
-import time
-import yaml
-from pathlib import Path
 import torch
-from fish_audio_sdk import TextToSpeech, Vocoder
 from pydub import AudioSegment
-# Load config
-config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
-with open(config_path) as f:
-    config = yaml.safe_load(f)
-# Initialize models
-tts_model = TextToSpeech(
-    model_name=config["tts"]["model"],
-    device=config["tts"]["device"],
-    precision=config["tts"]["precision"],
-)
-vocoder = Vocoder(
-    model_name=config["tts"]["vocoder"],
-    device=tts_model.device,
-)
 def generate_speech(text: str, language: str = "zh") -> str:
-    """Generate speech from text using Fish Audio SDK"""
-    # Format text with language tags
-    lang_template = config["generation"]["language_mapping"][language]
-    processed_text = lang_template.format(text=text)
-    # Generate mel spectrogram
-    mel = tts_model.generate(
-        text=processed_text,
-        temperature=config["generation"]["temperature"],
-        top_k=config["generation"]["top_k"],
-        max_length=config["generation"]["max_length"],
-    )
-    # Convert mel to waveform
-    waveform = vocoder(mel)
-    # Create audio segment
-    audio = AudioSegment(
-        waveform.numpy().tobytes(),
-        frame_rate=vocoder.sample_rate,
-        sample_width=2,
-        channels=1,
-    )
-    # Save output
-    output_path = f"temp/outputs/output_{int(time.time())}.wav"
-    audio.export(output_path, format="wav")
-    return output_path

+import os
 import torch
+import time
 from pydub import AudioSegment
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+from models import build_model
+# Hugging Face Spaces setup
+# MODEL_DIR is where the Kokoro checkpoint and voice pack are looked up.
+# makedirs only guarantees that the folder exists; it does not fetch any
+# model files into it.
+MODEL_DIR = "./kokoro"
+os.makedirs(MODEL_DIR, exist_ok=True)
+# Configure espeak-ng for Hugging Face environment
+# The shared-library path is hard-coded to an x86_64-linux-gnu location.
+# NOTE(review): presumably this must change on other platforms — confirm.
+EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')
+class TTSEngine:
+    """Kokoro text-to-speech engine that renders text to a WAV file.
+
+    Construction verifies that the checkpoint and the voice pack exist under
+    ``MODEL_DIR`` and then loads both on CUDA when available, otherwise CPU.
+    """
+    # Hard cap on input length (safety limit for Hugging Face Free Tier).
+    MAX_TEXT_CHARS = 500
+    # Sample rate declared for the exported audio.
+    SAMPLE_RATE = 24000
+    # Directory that receives the generated WAV files.
+    OUTPUT_DIR = "temp/outputs"
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Fail early with setup instructions rather than a bare loader error.
+        self._verify_model_files()
+        self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
+        # NOTE(review): torch.load unpickles the file; only load voice packs
+        # that come from a trusted source.
+        self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
+                                map_location=self.device)
+    def _verify_model_files(self):
+        """Ensure required model files exist.
+
+        Raises:
+            FileNotFoundError: If the checkpoint or the voice pack is missing.
+        """
+        required_files = [
+            f"{MODEL_DIR}/kokoro-v0_19.pth",
+            f"{MODEL_DIR}/voices/af_bella.pt",
+        ]
+        missing = [path for path in required_files if not os.path.exists(path)]
+        if missing:
+            raise FileNotFoundError(
+                f"Missing model files: {missing}\n"
+                "Add this to your Hugging Face Space settings:\n"
+                "App setup -> Clone Kokoro repository: "
+                "git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro"
+            )
+    def generate_speech(self, text: str, language: str = "zh") -> str:
+        """Synthesize ``text`` and return the path of the written WAV file.
+
+        Args:
+            text: Text to speak. Anything beyond ``MAX_TEXT_CHARS`` characters
+                is dropped.
+            language: Accepted for interface compatibility but currently
+                unused; synthesis is always requested with ``lang='en-us'``.
+
+        Returns:
+            Path of the mono 16-bit WAV file written under ``OUTPUT_DIR``.
+        """
+        from kokoro import generate_full
+        # Safety check for Hugging Face Free Tier: hard-cap the input. No
+        # textual marker is appended, because it would be synthesized as
+        # speech and would push the text back over the limit.
+        if len(text) > self.MAX_TEXT_CHARS:
+            text = text[:self.MAX_TEXT_CHARS]
+        audio, _ = generate_full(
+            self.model,
+            text,
+            self.voice,
+            lang='en-us',
+            max_len=200 if self.device == "cpu" else 500
+        )
+        # AudioSegment is told sample_width=2, so it must receive 16-bit PCM.
+        # as_tensor accepts both tensors and ndarrays; cpu() makes the data
+        # reachable from numpy even when the model ran on CUDA.
+        samples = torch.as_tensor(audio).detach().cpu()
+        if samples.is_floating_point():
+            # assumes float output is normalized to [-1, 1] — TODO confirm
+            samples = samples.clamp(-1.0, 1.0) * 32767.0
+        pcm = samples.to(torch.int16).contiguous().numpy().tobytes()
+        # Save output
+        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
+        # Nanosecond timestamp: two requests within the same second no longer
+        # overwrite each other's file.
+        output_path = f"{self.OUTPUT_DIR}/output_{time.time_ns()}.wav"
+        AudioSegment(
+            pcm,
+            frame_rate=self.SAMPLE_RATE,
+            sample_width=2,
+            channels=1
+        ).export(output_path, format="wav")
+        return output_path
+# Initialize TTS engine once
+# The previous @st.cache_resource decorator referenced `st`, which this module
+# never imports, so importing the module raised NameError. A lazily filled
+# module-level singleton gives the same build-once behavior with no extra
+# dependency.
+_tts_engine = None
+def get_tts_engine():
+    """Return the shared TTSEngine, constructing it on first use."""
+    global _tts_engine
+    if _tts_engine is None:
+        _tts_engine = TTSEngine()
+    return _tts_engine
 def generate_speech(text: str, language: str = "zh") -> str:
+    """Module-level entry point: delegate synthesis to the shared engine.
+
+    Returns whatever the engine's ``generate_speech`` returns for the same
+    ``text`` and ``language``.
+    """
+    engine = get_tts_engine()
+    return engine.generate_speech(text, language)