Spaces:

HumeAI
/

expressive-tts-arena

Running

expressive-tts-arena / src /integrations /hume_api.py

zach

Fix types in integrations package

fc85b67 4 months ago

8.96 kB

	"""
	hume_api.py

	This file defines the interaction with the Hume text-to-speech (TTS) API.
	It includes functionality for API request handling and processing API responses.

	Key Features:
	- Encapsulates all logic related to the Hume TTS API.
	- Implements retry logic for handling transient API errors.
	- Handles received audio and processes it for playback on the web.
	- Provides detailed logging for debugging and error tracking.

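	Classes:
	- HumeConfig: Immutable configuration for interacting with Hume's TTS API.
	- HumeError: Custom exception for Hume API-related errors.
	- UnretryableHumeError: HumeError subclass for errors that should not be retried.

	Functions:
	- text_to_speech_with_hume: Synthesizes speech from text using Hume's TTS API.
	- parse_hume_tts_generation: Extracts the generation ID and audio from a generation and saves the audio to a file.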
	"""

	# Standard Library Imports
	import logging
	from dataclasses import dataclass, field
	from typing import Any, Dict, Literal, Tuple, Union

	# Third-Party Library Imports
	import requests
	from requests.exceptions import HTTPError
	from tenacity import after_log, before_log, retry, retry_if_exception, stop_after_attempt, wait_fixed

	# Local Application Imports
	from src.config import Config, logger
	from src.constants import CLIENT_ERROR_CODE, SERVER_ERROR_CODE
	from src.utils import save_base64_audio_to_file, validate_env_var

	# Closed set of audio format names used as the type of `HumeConfig.file_format`;
	# the selected value is sent to the API as the request body's `format.type`.
	HumeSupportedFileFormat = Literal["mp3", "pcm", "wav"]
	"""Supported audio file formats for the Hume TTS API"""


	@dataclass(frozen=True)
	class HumeConfig:
	    """
	    Immutable configuration for interacting with the Hume TTS API.

	    The API key is obtained from `validate_env_var("HUME_API_KEY")` when the instance is
	    created, and the request headers are derived from it.

	    Attributes:
	        api_key (str): Hume API key. Computed in `__post_init__`; not accepted by `__init__`.
	        headers (Dict[str, str]): HTTP headers sent with each request. Computed in
	            `__post_init__`; not accepted by `__init__`.
	        url (str): Endpoint URL of the Hume TTS API.
	        file_format (HumeSupportedFileFormat): Audio format requested from the API.

	    Raises:
	        ValueError: If `url` or `file_format` is empty.
	    """

	    # Computed fields.
	    # Both carry the secret API key, so they are kept out of the generated __repr__ to avoid
	    # leaking it into logs and tracebacks. `headers` is also excluded from comparison and
	    # hashing: it is fully derived from `api_key`, and hashing a dict would raise TypeError.
	    api_key: str = field(init=False, repr=False)
	    headers: Dict[str, str] = field(init=False, repr=False, compare=False)

	    # Provided fields.
	    url: str = "https://test-api.hume.ai/v0/tts/octave"
	    file_format: HumeSupportedFileFormat = "mp3"

	    def __post_init__(self) -> None:
	        # Validate required attributes.
	        if not self.url:
	            raise ValueError("Hume TTS endpoint URL is not set.")
	        if not self.file_format:
	            raise ValueError("Hume TTS file format is not set.")

	        # The dataclass is frozen, so computed fields must be set via object.__setattr__.
	        computed_api_key = validate_env_var("HUME_API_KEY")
	        object.__setattr__(self, "api_key", computed_api_key)

	        computed_headers = {
	            "X-Hume-Api-Key": f"{computed_api_key}",
	            "Content-Type": "application/json",
	        }
	        object.__setattr__(self, "headers", computed_headers)


	class HumeError(Exception):
	    """
	    Base exception for failures while using the Hume TTS API.

	    Attributes:
	        message (str): Description of the failure; also passed to `Exception` as its argument.
	        original_exception (Union[Exception, None]): The underlying exception, when there is one.
	    """

	    def __init__(self, message: str, original_exception: Union[Exception, None] = None):
	        # Keep the details as attributes, then let Exception record the message in `args`.
	        self.message = message
	        self.original_exception = original_exception
	        super().__init__(message)


	class UnretryableHumeError(HumeError):
	    """
	    Hume TTS API error that should not be retried.

	    Raised by `text_to_speech_with_hume` when the API answers with a client-side HTTP error.
	    Carries the same `message` and `original_exception` attributes as `HumeError`.
	    """

	    def __init__(self, message: str, original_exception: Union[Exception, None] = None):
	        # HumeError.__init__ already stores both `message` and `original_exception`,
	        # so nothing needs to be assigned again here.
	        super().__init__(message, original_exception)


	# Upper bounds, in seconds, for (establishing the connection, waiting on the response).
	# Without a timeout `requests` waits indefinitely on a stalled connection.
	_HUME_REQUEST_TIMEOUT_SECONDS = (10, 120)


	def _is_retryable_hume_error(exception: BaseException) -> bool:
	    """Return True only for Hume API failures that are worth another attempt."""
	    return isinstance(exception, HumeError) and not isinstance(exception, UnretryableHumeError)


	@retry(
	    # Without an explicit predicate tenacity retries on *every* exception, which would also
	    # retry UnretryableHumeError and the ValueError raised for invalid arguments.
	    retry=retry_if_exception(_is_retryable_hume_error),
	    stop=stop_after_attempt(3),
	    wait=wait_fixed(2),
	    before=before_log(logger, logging.DEBUG),
	    after=after_log(logger, logging.DEBUG),
	    reraise=True,
	)
	def text_to_speech_with_hume(
	    character_description: str,
	    text: str,
	    num_generations: int,
	    config: Config,
	) -> Union[Tuple[str, str], Tuple[str, str, str, str]]:
	    """
	    Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.

	    This function sends a POST request to the Hume TTS API with a character description and text
	    to be converted to speech. Depending on the specified number of generations (allowed values: 1 or 2),
	    the API returns one or two generations. For each generation, the function extracts the base64-encoded
	    audio and the generation ID, saves the audio to a file via `parse_hume_tts_generation`, and returns
	    the relevant details.

	    Retryable failures (`HumeError` that is not an `UnretryableHumeError`) are attempted up to three
	    times in total, two seconds apart; the last error is re-raised as-is.

	    Args:
	        character_description (str): A description of the character, which is used as contextual input
	            for generating the voice. An empty description is sent to the API as null.
	        text (str): The text to be converted to speech.
	        num_generations (int): The number of audio generations to request from the API.
	            Allowed values are 1 or 2. If 1, only a single generation is processed; if 2, a second
	            generation is expected in the API response.
	        config (Config): The application configuration containing Hume API settings.

	    Returns:
	        Union[Tuple[str, str], Tuple[str, str, str, str]]:
	            - If num_generations == 1: (generation_a_id, audio_a_path).
	            - If num_generations == 2: (generation_a_id, audio_a_path, generation_b_id, audio_b_path).

	    Raises:
	        ValueError: If num_generations is not 1 or 2. Not retried.
	        HumeError: If there is an error communicating with the Hume TTS API or parsing its response,
	            including a response that contains fewer generations than requested.
	        UnretryableHumeError: If a client-side HTTP error (status code in the 4xx range) is encountered.
	            Not retried.
	    """
	    logger.debug(
	        f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. "
	        f"Text length: {len(text)} characters."
	    )

	    if num_generations < 1 or num_generations > 2:
	        raise ValueError("Invalid number of generations specified. Must be 1 or 2.")

	    hume_config = config.hume_config
	    request_body = {
	        "utterances": [{"text": text, "description": character_description or None}],
	        "format": {"type": hume_config.file_format},
	        "num_generations": num_generations,
	    }

	    try:
	        # Synthesize speech using the Hume TTS API.
	        response = requests.post(
	            url=hume_config.url,
	            headers=hume_config.headers,
	            json=request_body,
	            timeout=_HUME_REQUEST_TIMEOUT_SECONDS,
	        )
	        response.raise_for_status()
	        response_data = response.json()

	        generations = response_data.get("generations")
	        if not generations:
	            msg = "No generations returned by Hume API."
	            logger.error(msg)
	            raise HumeError(msg)

	        # Fail with a descriptive error instead of an IndexError further down.
	        if len(generations) < num_generations:
	            msg = f"Hume API returned {len(generations)} generation(s); expected {num_generations}."
	            logger.error(msg)
	            raise HumeError(msg)

	        # Extract the base64 encoded audio and generation ID from each generation.
	        generation_a_id, audio_a_path = parse_hume_tts_generation(generations[0], config)
	        if num_generations == 1:
	            return (generation_a_id, audio_a_path)

	        generation_b_id, audio_b_path = parse_hume_tts_generation(generations[1], config)
	        return (generation_a_id, audio_a_path, generation_b_id, audio_b_path)

	    except HumeError:
	        # Already a descriptive domain error; do not wrap it a second time.
	        raise

	    except HTTPError as e:
	        # Client errors will not succeed on a retry, so flag them as unretryable.
	        if e.response is not None and CLIENT_ERROR_CODE <= e.response.status_code < SERVER_ERROR_CODE:
	            raise UnretryableHumeError(
	                message=f"{e.response.text}",
	                original_exception=e,
	            ) from e

	        raise HumeError(
	            message=f"{e}",
	            original_exception=e,
	        ) from e

	    except Exception as e:
	        # Network failures, malformed JSON, missing keys, file errors, etc.
	        raise HumeError(
	            message=f"{e}",
	            original_exception=e,
	        ) from e


	def parse_hume_tts_generation(generation: Dict[str, Any], config: Config) -> Tuple[str, str]:
	    """
	    Parse a Hume TTS generation response and save the decoded audio to a file.

	    This function extracts the generation ID and the base64-encoded audio from the provided
	    dictionary. It then passes the audio to `save_base64_audio_to_file`, naming the file
	    `<generation_id>.<file_format>`, where the extension is the format configured in
	    `config.hume_config.file_format` (the same value sent to the API). Finally, it returns a
	    tuple containing the generation ID and the file path of the saved audio.

	    Args:
	        generation (Dict[str, Any]): A dictionary representing the TTS generation response from Hume.
	            Expected keys are:
	                - "generation_id" (str): A unique identifier for the generated audio.
	                - "audio" (str): A base64 encoded string of the audio data.
	        config (Config): The application configuration used for saving the audio file.

	    Returns:
	        Tuple[str, str]: A tuple containing:
	            - generation_id (str): The unique identifier for the audio generation.
	            - audio_path (str): The path returned by `save_base64_audio_to_file`.

	    Raises:
	        KeyError: If the "generation_id" or "audio" key is missing (or null) in the generation dictionary.
	        Exception: Propagates any exceptions raised by save_base64_audio_to_file.
	    """
	    generation_id = generation.get("generation_id")
	    if generation_id is None:
	        raise KeyError("The generation dictionary is missing the 'generation_id' key.")

	    base64_audio = generation.get("audio")
	    if base64_audio is None:
	        raise KeyError("The generation dictionary is missing the 'audio' key.")

	    # Use the format that was actually requested from the API as the extension, so that
	    # e.g. WAV or PCM data is not written to a file named "*.mp3".
	    filename = f"{generation_id}.{config.hume_config.file_format}"
	    audio_file_path = save_base64_audio_to_file(base64_audio, filename, config)
	    return generation_id, audio_file_path