zach
Fix types in integrations package
fc85b67
raw
history blame
8.96 kB
"""
hume_api.py
This file defines the interaction with the Hume text-to-speech (TTS) API.
It includes functionality for API request handling and processing API responses.
Key Features:
- Encapsulates all logic related to the Hume TTS API.
- Implements retry logic for handling transient API errors.
- Handles received audio and processes it for playback on the web.
- Provides detailed logging for debugging and error tracking.
Classes:
- HumeConfig: Immutable configuration for interacting with Hume's TTS API.
- HumeError: Custom exception for Hume API-related errors.
Functions:
- text_to_speech_with_hume: Synthesizes speech from text using Hume's TTS API.
"""
# Standard Library Imports
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, Literal, Tuple, Union
# Third-Party Library Imports
import requests
from requests.exceptions import HTTPError
from tenacity import after_log, before_log, retry, stop_after_attempt, wait_fixed
# Local Application Imports
from src.config import Config, logger
from src.constants import CLIENT_ERROR_CODE, SERVER_ERROR_CODE
from src.utils import save_base64_audio_to_file, validate_env_var
HumeSupportedFileFormat = Literal["mp3", "pcm", "wav"]
"""Supported audio file formats for the Hume TTS API"""
@dataclass(frozen=True)
class HumeConfig:
"""Immutable configuration for interacting with the Hume TTS API."""
# Computed fields.
api_key: str = field(init=False)
headers: Dict[str, str] = field(init=False)
# Provided fields.
url: str = "https://test-api.hume.ai/v0/tts/octave"
file_format: HumeSupportedFileFormat = "mp3"
def __post_init__(self) -> None:
# Validate required attributes.
if not self.url:
raise ValueError("Hume TTS endpoint URL is not set.")
if not self.file_format:
raise ValueError("Hume TTS file format is not set.")
# Compute the API key from the environment.
computed_api_key = validate_env_var("HUME_API_KEY")
object.__setattr__(self, "api_key", computed_api_key)
# Compute the headers.
computed_headers = {
"X-Hume-Api-Key": f"{computed_api_key}",
"Content-Type": "application/json",
}
object.__setattr__(self, "headers", computed_headers)
class HumeError(Exception):
"""Custom exception for errors related to the Hume TTS API."""
def __init__(self, message: str, original_exception: Union[Exception, None] = None):
super().__init__(message)
self.original_exception = original_exception
self.message = message
class UnretryableHumeError(HumeError):
"""Custom exception for errors related to the Hume TTS API that should not be retried."""
def __init__(self, message: str, original_exception: Union[Exception, None] = None):
super().__init__(message, original_exception)
self.original_exception = original_exception
@retry(
stop=stop_after_attempt(3),
wait=wait_fixed(2),
before=before_log(logger, logging.DEBUG),
after=after_log(logger, logging.DEBUG),
reraise=True,
)
def text_to_speech_with_hume(
character_description: str,
text: str,
num_generations: int,
config: Config,
) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
"""
Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
This function sends a POST request to the Hume TTS API with a character description and text
to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
the API returns one or two generations. For each generation, the function extracts the base64-encoded
audio and the generation ID, saves the audio as an MP3 file via the `save_base64_audio_to_file` helper,
and returns the relevant details.
Args:
character_description (str): A description of the character, which is used as contextual input
for generating the voice.
text (str): The text to be converted to speech.
num_generations (int): The number of audio generations to request from the API.
Allowed values are 1 or 2. If 1, only a single generation is processed; if 2, a second
generation is expected in the API response.
config (Config): The application configuration containing Hume API settings.
Returns:
Union[Tuple[str, str], Tuple[str, str, str, str]]:
- If num_generations == 1: (generation_a_id, audio_a_path).
- If num_generations == 2: (generation_a_id, audio_a_path, generation_b_id, audio_b_path).
Raises:
ValueError: If num_generations is not 1 or 2.
HumeError: If there is an error communicating with the Hume TTS API or parsing its response.
UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is encountered.
Exception: Any other exceptions raised during the request or processing will be wrapped and
re-raised as HumeError.
"""
logger.debug(
f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. "
f"Text length: {len(text)} characters."
)
if num_generations < 1 or num_generations > 2:
raise ValueError("Invalid number of generations specified. Must be 1 or 2.")
hume_config = config.hume_config
request_body = {
"utterances": [{"text": text, "description": character_description or None}],
"format": {"type": hume_config.file_format},
"num_generations": num_generations,
}
try:
# Synthesize speech using the Hume TTS API
response = requests.post(
url=hume_config.url,
headers=hume_config.headers,
json=request_body,
)
response.raise_for_status()
response_data = response.json()
generations = response_data.get("generations")
if not generations:
msg = "No generations returned by Hume API."
logger.error(msg)
raise HumeError(msg)
# Extract the base64 encoded audio and generation ID from the generation.
generation_a = generations[0]
generation_a_id, audio_a_path = parse_hume_tts_generation(generation_a, config)
if num_generations == 1:
return (generation_a_id, audio_a_path)
generation_b = generations[1]
generation_b_id, audio_b_path = parse_hume_tts_generation(generation_b, config)
return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)
except Exception as e:
if (
isinstance(e, HTTPError)
and e.response is not None
and CLIENT_ERROR_CODE <= e.response.status_code < SERVER_ERROR_CODE
):
raise UnretryableHumeError(
message=f"{e.response.text}",
original_exception=e,
) from e
raise HumeError(
message=f"{e}",
original_exception=e,
) from e
def parse_hume_tts_generation(generation: Dict[str, Any], config: Config) -> Tuple[str, str]:
"""
Parse a Hume TTS generation response and save the decoded audio as an MP3 file.
This function extracts the generation ID and the base64-encoded audio from the provided
dictionary. It then decodes and saves the audio data to an MP3 file, naming the file using
the generation ID. Finally, it returns a tuple containing the generation ID and the file path
of the saved audio.
Args:
generation (Dict[str, Any]): A dictionary representing the TTS generation response from Hume.
Expected keys are:
- "generation_id" (str): A unique identifier for the generated audio.
- "audio" (str): A base64 encoded string of the audio data.
config (Config): The application configuration used for saving the audio file.
Returns:
Tuple[str, str]: A tuple containing:
- generation_id (str): The unique identifier for the audio generation.
- audio_path (str): The filesystem path where the audio file was saved.
Raises:
KeyError: If the "generation_id" or "audio" key is missing from the generation dictionary.
Exception: Propagates any exceptions raised by save_base64_audio_to_file, such as errors during
the decoding or file saving process.
"""
generation_id = generation.get("generation_id")
if generation_id is None:
raise KeyError("The generation dictionary is missing the 'generation_id' key.")
base64_audio = generation.get("audio")
if base64_audio is None:
raise KeyError("The generation dictionary is missing the 'audio' key.")
filename = f"{generation_id}.mp3"
audio_file_path = save_base64_audio_to_file(base64_audio, filename, config)
return generation_id, audio_file_path