Spaces:
Runtime error
Runtime error
File size: 4,469 Bytes
05b45a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
#!/usr/bin/env python3
import os
import time
import wave
import numpy as np
import requests
import sounddevice as sd
def play_streaming_tts(text: str, output_file: str = None, voice: str = "af_sky"):
    """Stream TTS audio from the local speech endpoint and play it as it arrives.

    Posts ``text`` to ``http://localhost:8880/v1/audio/speech`` requesting raw
    PCM, writes the received audio to a mono 16-bit ``sounddevice`` output
    stream at 24 kHz, prints timing/progress statistics, and optionally saves
    all received bytes to a WAV file once the stream has completed.

    Args:
        text: Text to synthesize.
        output_file: Path of the WAV file to write after a successful stream;
            nothing is saved when this is falsy.
        voice: Voice identifier sent in the request payload.

    Returns:
        None. Errors raised while requesting or playing the stream are caught
        and printed; errors raised while opening or starting the audio output
        stream propagate to the caller.
    """
    print("\nStarting TTS stream request...")
    start_time = time.time()

    # Initialize variables
    sample_rate = 24000  # Known sample rate for Kokoro
    bytes_per_sample = 2  # 16-bit PCM
    audio_started = False
    chunk_count = 0
    total_bytes = 0
    all_audio_data = bytearray()  # Raw PCM audio data, exactly as received
    pending = b""  # Trailing bytes that do not yet form a whole sample

    # Start sounddevice stream with buffer
    stream = sd.OutputStream(
        samplerate=sample_rate,
        channels=1,
        dtype=np.int16,
        blocksize=1024,  # Buffer size in samples
        latency="low",  # Request low latency
    )
    stream.start()

    try:
        # Make streaming request to API; the context manager releases the
        # connection even if playback fails part-way through.
        with requests.post(
            "http://localhost:8880/v1/audio/speech",
            json={
                "model": "kokoro",
                "input": text,
                "voice": voice,
                "response_format": "pcm",
                "stream": True,
            },
            stream=True,
            timeout=1800,
        ) as response:
            response.raise_for_status()
            print(f"Request started successfully after {time.time() - start_time:.2f}s")

            # Process streaming response with smaller chunks for lower latency
            # (512 bytes = 256 samples at 16-bit).
            for chunk in response.iter_content(chunk_size=512):
                if not chunk:
                    continue

                chunk_count += 1
                total_bytes += len(chunk)

                # Handle first chunk
                if not audio_started:
                    first_chunk_time = time.time()
                    print(
                        f"\nReceived first chunk after {first_chunk_time - start_time:.2f}s"
                    )
                    print(f"First chunk size: {len(chunk)} bytes")
                    audio_started = True

                # Accumulate raw audio data
                all_audio_data.extend(chunk)

                # Chunk lengths are not guaranteed to be a multiple of the
                # sample width, and np.frombuffer rejects a buffer that is
                # not. Carry any odd trailing byte over to the next chunk so
                # samples stay aligned.
                data = pending + chunk
                usable = len(data) - (len(data) % bytes_per_sample)
                pending = data[usable:]
                if usable:
                    # Convert bytes to numpy array and play
                    stream.write(np.frombuffer(data[:usable], dtype=np.int16))

                # Log progress every 100 chunks
                if chunk_count % 100 == 0:
                    elapsed = time.time() - start_time
                    print(
                        f"Progress: {chunk_count} chunks, {total_bytes/1024:.1f}KB received, {elapsed:.1f}s elapsed"
                    )

        # Final stats
        total_time = time.time() - start_time
        print("\nStream complete:")
        print(f"Total chunks: {chunk_count}")
        print(f"Total data: {total_bytes/1024:.1f}KB")
        print(f"Total time: {total_time:.2f}s")
        print(f"Average speed: {(total_bytes/1024)/total_time:.1f}KB/s")

        # Save as WAV file
        if output_file:
            print(f"\nWriting audio to {output_file}")
            with wave.open(output_file, "wb") as wav_file:
                wav_file.setnchannels(1)  # Mono
                wav_file.setsampwidth(bytes_per_sample)  # 2 bytes per sample (16-bit)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(all_audio_data)
            print(f"Saved {len(all_audio_data)} bytes of audio data")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection error - Is the server running? Error: {str(e)}")
    except Exception as e:
        # Best-effort demo script: report the failure instead of crashing.
        print(f"Error during streaming: {str(e)}")
    finally:
        # Clean up the audio stream on every path (success or failure).
        stream.stop()
        stream.close()
def main():
    """Read the bundled H. G. Wells sample, then stream and play an excerpt.

    The excerpt is the second and third blank-line-separated paragraphs of the
    sample file, joined by a single space. The synthesized audio is also saved
    as ``output.wav`` next to this script.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    sample_path = os.path.join(
        here, "assorted_checks/benchmarks/the_time_machine_hg_wells.txt"
    )
    wav_path = os.path.join(here, "output.wav")

    with open(sample_path, "r", encoding="utf-8") as handle:
        paragraphs = handle.read().split("\n\n")

    # Skip the first paragraph and keep the next two.
    excerpt = " ".join(paragraphs[1:3])

    print("\nStarting TTS stream playback...")
    print(f"Text length: {len(excerpt)} characters")
    print("\nFirst 100 characters:")
    print(excerpt[:100] + "...")

    play_streaming_tts(excerpt, output_file=wav_path)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|