Spaces:
Running
Running
""" | |
hume_api.py | |
This file defines the interaction with the Hume text-to-speech (TTS) API. | |
It includes functionality for API request handling and processing API responses. | |
Key Features: | |
- Encapsulates all logic related to the Hume TTS API. | |
- Implements retry logic for handling transient API errors. | |
- Handles received audio and processes it for playback on the web. | |
- Provides detailed logging for debugging and error tracking. | |
Classes: | |
- HumeConfig: Immutable configuration for interacting with Hume's TTS API. | |
- HumeError: Custom exception for Hume API-related errors. | |
Functions: | |
- text_to_speech_with_hume: Synthesizes speech from text using Hume's TTS API. | |
""" | |
# Standard Library Imports | |
import logging | |
from dataclasses import dataclass, field | |
from typing import Any, Dict, Literal, Tuple, Union | |
# Third-Party Library Imports | |
import requests | |
from requests.exceptions import HTTPError | |
from tenacity import after_log, before_log, retry, stop_after_attempt, wait_fixed | |
# Local Application Imports | |
from src.config import Config, logger | |
from src.constants import CLIENT_ERROR_CODE, SERVER_ERROR_CODE | |
from src.utils import save_base64_audio_to_file, validate_env_var | |
# Supported audio file formats for the Hume TTS API.
HumeSupportedFileFormat = Literal["mp3", "pcm", "wav"]
@dataclass(frozen=True)
class HumeConfig:
    """Immutable configuration for interacting with the Hume TTS API.

    The class is a frozen dataclass: ``url`` and ``file_format`` may be passed to
    the constructor, while ``api_key`` and ``headers`` are derived in
    ``__post_init__`` and cannot be supplied by callers.

    Attributes:
        api_key: Value returned by ``validate_env_var("HUME_API_KEY")``.
        headers: HTTP headers for API requests (API key plus JSON content type).
        url: Endpoint URL of the Hume TTS API.
        file_format: Audio format requested from the API.

    Raises:
        ValueError: If ``url`` or ``file_format`` is empty.
    """

    # Computed fields.
    api_key: str = field(init=False)
    headers: Dict[str, str] = field(init=False)

    # Provided fields.
    # NOTE(review): the default host is "test-api.hume.ai" -- confirm this is the intended endpoint.
    url: str = "https://test-api.hume.ai/v0/tts/octave"
    file_format: HumeSupportedFileFormat = "mp3"

    def __post_init__(self) -> None:
        # Validate required attributes.
        if not self.url:
            raise ValueError("Hume TTS endpoint URL is not set.")
        if not self.file_format:
            raise ValueError("Hume TTS file format is not set.")

        # Compute the API key from the environment.
        # object.__setattr__ is required because the dataclass is frozen.
        computed_api_key = validate_env_var("HUME_API_KEY")
        object.__setattr__(self, "api_key", computed_api_key)

        # Compute the headers.
        computed_headers = {
            "X-Hume-Api-Key": f"{computed_api_key}",
            "Content-Type": "application/json",
        }
        object.__setattr__(self, "headers", computed_headers)
class HumeError(Exception):
    """Exception raised for failures related to the Hume TTS API.

    Attributes:
        message: Human-readable description of the failure.
        original_exception: The underlying exception that caused this error, if any.
    """

    def __init__(self, message: str, original_exception: Union[Exception, None] = None):
        self.message = message
        self.original_exception = original_exception
        super().__init__(message)
class UnretryableHumeError(HumeError):
    """Hume TTS API error that signals the failed request should not be retried."""

    def __init__(self, message: str, original_exception: Union[Exception, None] = None):
        # The parent initializer stores both the message and the original exception.
        super().__init__(message, original_exception)
# (connect, read) timeout in seconds for the Hume TTS request. Without a timeout,
# `requests` waits indefinitely on an unresponsive server.
# NOTE(review): this file imports tenacity's retry helpers, but no retry decorator is
# applied to the function below in the visible source -- confirm whether one was intended.
_HUME_REQUEST_TIMEOUT_SECONDS = (10, 120)


def text_to_speech_with_hume(
    character_description: str,
    text: str,
    num_generations: int,
    config: Config,
) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
    """
    Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.

    This function sends a POST request to the Hume TTS API with a character description and text
    to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
    the API returns one or two generations. Each generation is handed to `parse_hume_tts_generation`,
    which extracts the generation ID and the base64-encoded audio and saves the audio to a file.

    Args:
        character_description (str): A description of the character, which is used as contextual input
            for generating the voice. An empty description is sent to the API as null.
        text (str): The text to be converted to speech.
        num_generations (int): The number of audio generations to request from the API.
            Allowed values are 1 or 2.
        config (Config): The application configuration containing Hume API settings
            (`config.hume_config` supplies the URL, headers and file format).

    Returns:
        Union[Tuple[str, str], Tuple[str, str, str, str]]:
            - If num_generations == 1: (generation_a_id, audio_a_path).
            - If num_generations == 2: (generation_a_id, audio_a_path, generation_b_id, audio_b_path).

    Raises:
        ValueError: If num_generations is not 1 or 2.
        UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is encountered.
        HumeError: For any other failure: server-side HTTP errors, timeouts and other request errors,
            a response with no or too few generations, or an error while parsing or saving a generation.
    """
    # Validate first so an invalid argument fails before any logging or network activity.
    if num_generations < 1 or num_generations > 2:
        raise ValueError("Invalid number of generations specified. Must be 1 or 2.")

    logger.debug(
        f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. "
        f"Text length: {len(text)} characters."
    )

    hume_config = config.hume_config
    request_body = {
        "utterances": [{"text": text, "description": character_description or None}],
        "format": {"type": hume_config.file_format},
        "num_generations": num_generations,
    }

    try:
        # Synthesize speech using the Hume TTS API.
        response = requests.post(
            url=hume_config.url,
            headers=hume_config.headers,
            json=request_body,
            timeout=_HUME_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        response_data = response.json()

        generations = response_data.get("generations")
        if not generations:
            msg = "No generations returned by Hume API."
            logger.error(msg)
            raise HumeError(msg)

        # Fail with a descriptive error instead of an IndexError when the API
        # returns fewer generations than were requested.
        if len(generations) < num_generations:
            msg = (
                f"Hume API returned {len(generations)} generation(s) "
                f"but {num_generations} were requested."
            )
            logger.error(msg)
            raise HumeError(msg)

        # Extract the generation ID and save the audio of the first generation.
        generation_a_id, audio_a_path = parse_hume_tts_generation(generations[0], config)
        if num_generations == 1:
            return (generation_a_id, audio_a_path)

        generation_b_id, audio_b_path = parse_hume_tts_generation(generations[1], config)
        return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)

    except HumeError:
        # Already a domain error raised above; re-raise unchanged instead of wrapping it again.
        raise
    except HTTPError as e:
        # 4xx responses indicate a problem with the request itself, so retrying cannot help.
        if (
            e.response is not None
            and CLIENT_ERROR_CODE <= e.response.status_code < SERVER_ERROR_CODE
        ):
            raise UnretryableHumeError(
                message=f"{e.response.text}",
                original_exception=e,
            ) from e
        raise HumeError(
            message=f"{e}",
            original_exception=e,
        ) from e
    except Exception as e:
        # Connection errors, timeouts, invalid JSON, and parsing or saving failures.
        raise HumeError(
            message=f"{e}",
            original_exception=e,
        ) from e
def parse_hume_tts_generation(generation: Dict[str, Any], config: Config) -> Tuple[str, str]:
    """
    Parse a Hume TTS generation response and save the decoded audio to a file.

    This function extracts the generation ID and the base64-encoded audio from the provided
    dictionary, then passes the audio to `save_base64_audio_to_file`. The file is named after the
    generation ID, with the configured Hume file format (`config.hume_config.file_format`) as its
    extension, so the extension matches the format that was requested from the API.

    Args:
        generation (Dict[str, Any]): A dictionary representing the TTS generation response from Hume.
            Expected keys are:
                - "generation_id" (str): A unique identifier for the generated audio.
                - "audio" (str): A base64 encoded string of the audio data.
        config (Config): The application configuration; supplies the Hume file format and is
            forwarded to `save_base64_audio_to_file`.

    Returns:
        Tuple[str, str]: A tuple containing:
            - generation_id (str): The unique identifier for the audio generation.
            - audio_path (str): The value returned by `save_base64_audio_to_file` for the saved audio.

    Raises:
        KeyError: If the "generation_id" or "audio" key is missing (or None) in the generation dictionary.
        Exception: Propagates any exceptions raised by save_base64_audio_to_file.
    """
    generation_id = generation.get("generation_id")
    if generation_id is None:
        raise KeyError("The generation dictionary is missing the 'generation_id' key.")

    base64_audio = generation.get("audio")
    if base64_audio is None:
        raise KeyError("The generation dictionary is missing the 'audio' key.")

    # Use the requested audio format as the extension rather than assuming MP3.
    filename = f"{generation_id}.{config.hume_config.file_format}"
    audio_file_path = save_base64_audio_to_file(base64_audio, filename, config)
    return generation_id, audio_file_path