tortoise-tts / tortoise /socket_server.py
richardseattle2025's picture
Upload folder using huggingface_hub
819225f verified
import spacy
import threading
import socket
from tortoise.api_fast import TextToSpeech
from utils.audio import load_voices
tts = TextToSpeech()
nlp = spacy.load("en_core_web_sm")
def generate_audio_stream(text, tts, voice_samples):
print(f"Generating audio stream...: {text}")
voice_samples, conditioning_latents = load_voices([voice_samples])
stream = tts.tts_stream(
text,
voice_samples=voice_samples,
conditioning_latents=conditioning_latents,
verbose=True,
stream_chunk_size=40 # Adjust chunk size as needed
)
for audio_chunk in stream:
yield audio_chunk
def split_text(text, max_length=200):
doc = nlp(text)
chunks = []
chunk = []
length = 0
for sent in doc.sents:
sent_length = len(sent.text)
if length + sent_length > max_length:
chunks.append(' '.join(chunk))
chunk = []
length = 0
chunk.append(sent.text)
length += sent_length + 1
if chunk:
chunks.append(' '.join(chunk))
return chunks
def handle_client(client_socket, tts):
try:
while True:
data = client_socket.recv(1024).decode('utf-8')
if not data:
break
character_name, text = data.split('|', 1)
text_chunks = split_text(text, max_length=200)
print(text_chunks)
for chunk in text_chunks:
audio_stream = generate_audio_stream(chunk, tts, character_name)
for audio_chunk in audio_stream:
audio_data = audio_chunk.cpu().numpy().flatten()
client_socket.sendall(audio_data.tobytes())
client_socket.sendall(b"END_OF_AUDIO")
finally:
client_socket.close()
print("Client disconnected.")
def start_server():
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(('0.0.0.0', 5000))
server.listen(5)
print("Server listening on port 5000")
while True:
client_socket, addr = server.accept()
print(f"Accepted connection from {addr}")
client_handler = threading.Thread(target=handle_client, args=(client_socket, tts))
client_handler.start()
if __name__ == "__main__":
start_server()