from kokoro import KPipeline
import soundfile as sf
import numpy as np
import logging

logging.basicConfig(level=logging.INFO)

try:
    # On Hugging Face Spaces, wrap generation in a GPU-allocated function
    import spaces

    @spaces.GPU(duration=60)
    def get_generator(pipeline, text, voice, speed, split_pattern):
        return pipeline(text, voice=voice, speed=speed, split_pattern=split_pattern)

    logging.info("Running TTS in spaces")
except ImportError:
    logging.info("Spaces not available")

    def get_generator(pipeline, text, voice, speed, split_pattern):
        return pipeline(text, voice=voice, speed=speed, split_pattern=split_pattern)


pipeline = KPipeline(lang_code="a")
try:
    pipeline = pipeline.to("cuda")
except Exception:
    logging.warning("CUDA not available, using CPU")


def generate_audio(
    text,
    voice="af_heart",
    speed=1.0,
    save_segments=False,
    progress=None,
):
    """
    Generate audio from text using the Kokoro TTS pipeline.

    Args:
        text (str): Text to convert to speech
        voice (str): Voice ID to use
        speed (float): Speech speed multiplier
        save_segments (bool): Whether to save individual audio segments
        progress: Optional Gradio-style progress tracker exposing a ``tqdm`` method

    Returns:
        numpy.ndarray: Combined audio data at a 24 kHz sample rate
    """
    generator = get_generator(pipeline, text, voice, speed, split_pattern=r"\.")
    all_audio = []
    segments = list(generator)  # Materialize segments to get a total count

    # Only wrap the loop in a progress bar when a tracker is provided
    iterable = progress.tqdm(segments, desc="Generating audio") if progress else segments
    for i, (gs, ps, audio) in enumerate(iterable):
        logging.info("Processing segment %d", i)
        logging.info(f"Graphemes: {gs}")
        logging.info(f"Phonemes: {ps}")
        all_audio.append(audio)
        if save_segments:
            sf.write(f"segment_{i}.wav", audio, 24000)

    # Concatenate all audio segments into a single waveform
    combined_audio = np.concatenate(all_audio)
    return combined_audio


if __name__ == "__main__":
    # Example usage
    sample_text = "Hello world"
    audio_data = generate_audio(sample_text)
    sf.write("out.wav", audio_data, 24000)