Spaces:
Running
Running
File size: 4,347 Bytes
5ed9749 e07c653 5ed9749 e07c653 5ed9749 e07c653 5ed9749 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# Standard Library Imports
import asyncio
import random
from typing import Tuple
# Local Application Imports
from src.common import Config, Option, OptionMap, TTSProviderName, logger
from src.common.constants import ELEVENLABS, HUME_AI, OPENAI
from src.integrations import (
text_to_speech_with_elevenlabs,
text_to_speech_with_hume,
text_to_speech_with_openai,
)
class TTSService:
    """
    Coordinates text-to-speech generation across the supported TTS providers.

    The service picks a pair of providers, issues both synthesis requests
    together, and packages the two results into a randomly ordered option map.
    """

    def __init__(self, config: Config):
        """
        Store the configuration and register one synthesis function per provider.

        Args:
            config (Config): Application configuration forwarded to every provider call
        """
        self.config = config
        self.tts_provider_functions = {
            HUME_AI: text_to_speech_with_hume,
            ELEVENLABS: text_to_speech_with_elevenlabs,
            OPENAI: text_to_speech_with_openai,
        }

    def __select_providers(self, text_modified: bool) -> Tuple[TTSProviderName, TTSProviderName]:
        """
        Choose the two TTS providers used for a synthesis request.

        Modified text always yields (HUME_AI, HUME_AI). Otherwise one of the
        pairs below is drawn at random, each with equal weight:
            - HUME_AI & OPENAI
            - HUME_AI & ELEVENLABS
            - OPENAI & ELEVENLABS

        Args:
            text_modified (bool): Whether the text has been modified

        Returns:
            tuple: The selected (TTSProviderName, TTSProviderName) pair
        """
        if text_modified:
            return HUME_AI, HUME_AI

        # `pair_weights` is matched to `candidate_pairs` by position; keep the
        # two lists aligned whenever the probability distribution is changed.
        candidate_pairs = [
            (HUME_AI, OPENAI),
            (HUME_AI, ELEVENLABS),
            (OPENAI, ELEVENLABS),
        ]
        pair_weights = [1, 1, 1]
        (chosen_pair,) = random.choices(candidate_pairs, weights=pair_weights, k=1)
        return chosen_pair

    async def synthesize_speech(
        self,
        character_description: str,
        text: str,
        text_modified: bool
    ) -> OptionMap:
        """
        Synthesize the given text with two TTS providers and return both results.

        The provider pair is chosen from `text_modified`, both provider calls are
        awaited together via `asyncio.gather`, and the two resulting options are
        shuffled before being assigned to the "option_a" / "option_b" slots.

        Args:
            character_description (str): Description of the character/voice for synthesis
            text (str): The text to synthesize into speech
            text_modified (bool): Whether the text has been modified from the original

        Returns:
            OptionMap: A mapping with keys "option_a" and "option_b"; each entry holds
                the option's "provider", "generation_id" and "audio_file_path".
        """

        def to_payload(option):
            # Flatten an Option into the dictionary shape stored under each slot.
            return {
                "provider": option.provider,
                "generation_id": option.generation_id,
                "audio_file_path": option.audio,
            }

        provider_a, provider_b = self.__select_providers(text_modified)
        logger.info(f"Starting speech synthesis with providers: {provider_a} and {provider_b}")

        # Create both provider requests first so they can be awaited together.
        request_a = self.tts_provider_functions[provider_a](character_description, text, self.config)
        request_b = self.tts_provider_functions[provider_b](character_description, text, self.config)
        result_a, result_b = await asyncio.gather(request_a, request_b)

        # Each provider result unpacks as (generation_id, audio).
        generation_id_a, audio_a = result_a
        generation_id_b, audio_b = result_b
        logger.info(f"Synthesis succeeded for providers: {provider_a} and {provider_b}")

        candidates = [
            Option(provider=provider_a, audio=audio_a, generation_id=generation_id_a),
            Option(provider=provider_b, audio=audio_b, generation_id=generation_id_b),
        ]
        # Randomize which result ends up in which slot.
        random.shuffle(candidates)
        first_option, second_option = candidates

        return {
            "option_a": to_payload(first_option),
            "option_b": to_payload(second_option),
        }
|