Spaces:

HumeAI
/

expressive-tts-arena

Running

expressive-tts-arena / src /integrations /elevenlabs_api.py

zach

Fix types in integrations package

fc85b67 4 months ago

5.47 kB

	"""
	elevenlabs_api.py

	This file defines the interaction with the ElevenLabs text-to-speech (TTS) API using the
	ElevenLabs Python SDK. It includes functionality for API request handling and processing API responses.

	Key Features:
	- Encapsulates all logic related to the ElevenLabs TTS API.
	- Implements retry logic using Tenacity for handling transient API errors.
	- Handles received audio and processes it for playback on the web.
	- Provides detailed logging for debugging and error tracking.
	- Utilizes robust error handling (EAFP) to validate API responses.

	Classes:
	- ElevenLabsConfig: Immutable configuration for interacting with ElevenLabs' TTS API.
	- ElevenLabsError: Custom exception for ElevenLabs API-related errors.
	- UnretryableElevenLabsError: ElevenLabsError subclass for API errors that should not be retried.

	Functions:
	- text_to_speech_with_elevenlabs: Synthesizes speech from text using ElevenLabs' TTS API.
	"""

	# Standard Library Imports
	import logging
	import random
	from dataclasses import dataclass, field
	from typing import Optional, Tuple

	# Third-Party Library Imports
	from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
	from elevenlabs.core import ApiError
	from tenacity import after_log, before_log, retry, retry_if_exception, stop_after_attempt, wait_fixed

	# Local Application Imports
	from src.config import Config, logger
	from src.constants import CLIENT_ERROR_CODE, SERVER_ERROR_CODE
	from src.utils import save_base64_audio_to_file, validate_env_var


	@dataclass(frozen=True)
	class ElevenLabsConfig:
	    """
	    Immutable configuration for interacting with the ElevenLabs TTS API.

	    The API key is never passed by the caller; it is resolved in ``__post_init__``
	    via ``validate_env_var("ELEVENLABS_API_KEY")``.
	    """

	    # Excluded from the generated __repr__ so the secret cannot leak into logs or tracebacks.
	    api_key: str = field(init=False, repr=False)
	    output_format: TextToVoiceCreatePreviewsRequestOutputFormat = "mp3_44100_128"

	    def __post_init__(self) -> None:
	        """
	        Validate the configuration and resolve the API key.

	        Raises:
	            ValueError: If ``output_format`` is empty.
	        """
	        # Validate required attributes.
	        if not self.output_format:
	            raise ValueError("ElevenLabs TTS API output format is not set.")

	        # Compute the API key from the environment.
	        computed_key = validate_env_var("ELEVENLABS_API_KEY")
	        # The dataclass is frozen, so normal attribute assignment is blocked;
	        # object.__setattr__ is the standard way to initialise a frozen field.
	        object.__setattr__(self, "api_key", computed_key)

	    @property
	    def client(self) -> ElevenLabs:
	        """
	        Build an ElevenLabs client configured with this config's API key.

	        Note that the client is not cached: a new instance is constructed on every access.

	        Returns:
	            ElevenLabs: Configured client instance.
	        """
	        return ElevenLabs(api_key=self.api_key)


	class ElevenLabsError(Exception):
	    """
	    Error raised for failures related to the ElevenLabs TTS API.

	    Attributes:
	        message: Human-readable description of the failure (also used as the exception text).
	        original_exception: The underlying exception that triggered this error, if any.
	    """

	    def __init__(self, message: str, original_exception: Optional[Exception] = None):
	        # Record the details on the instance, then let Exception store the message in args.
	        self.message = message
	        self.original_exception = original_exception
	        super().__init__(message)


	class UnretryableElevenLabsError(ElevenLabsError):
	    """
	    ElevenLabs TTS API error that should not be retried.

	    Carries the same ``message`` and ``original_exception`` data as ElevenLabsError;
	    the distinct type exists so callers can tell these failures apart.
	    """

	    def __init__(self, message: str, original_exception: Optional[Exception] = None):
	        # Nothing extra to store; delegate straight to the base error.
	        super().__init__(message=message, original_exception=original_exception)


	def _is_retryable_elevenlabs_error(error: BaseException) -> bool:
	    """Return True unless the error was explicitly marked as unretryable."""
	    return not isinstance(error, UnretryableElevenLabsError)


	def _extract_api_error_message(error: ApiError) -> str:
	    """Return ``error.body['detail']['message']``, falling back to ``str(error)`` if the body has another shape."""
	    try:
	        return f"{error.body['detail']['message']}"
	    except (KeyError, IndexError, TypeError):
	        # Body may be None, a plain string, or carry a list/str "detail"; never let the
	        # message lookup itself raise and hide the real API error.
	        return f"{error}"


	@retry(
	    # Without an explicit predicate tenacity retries on every exception, which would
	    # also retry errors deliberately raised as UnretryableElevenLabsError.
	    retry=retry_if_exception(_is_retryable_elevenlabs_error),
	    stop=stop_after_attempt(3),
	    wait=wait_fixed(2),
	    before=before_log(logger, logging.DEBUG),
	    after=after_log(logger, logging.DEBUG),
	    reraise=True,
	)
	def text_to_speech_with_elevenlabs(
	    character_description: str, text: str, config: Config
	) -> Tuple[None, str]:
	    """
	    Synthesizes text to speech using the ElevenLabs TTS API, processes the audio data, and writes it to a file.

	    The call is attempted up to 3 times with a fixed 2 second wait between attempts, except for
	    errors raised as UnretryableElevenLabsError, which are propagated immediately.

	    Args:
	        character_description (str): The character description used as the voice description.
	        text (str): The text to be synthesized into speech.
	        config (Config): Application configuration; provides ``elevenlabs_config`` and is forwarded to
	            ``save_base64_audio_to_file``.

	    Returns:
	        Tuple[None, str]: A tuple containing:
	            - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for
	              uniformity across TTS integrations.
	            - file_path (str): The file path returned by ``save_base64_audio_to_file`` for the saved audio.

	    Raises:
	        UnretryableElevenLabsError: If the API responds with a status code in the range
	            [CLIENT_ERROR_CODE, SERVER_ERROR_CODE).
	        ElevenLabsError: For any other failure communicating with the ElevenLabs API or processing the
	            response, including an empty list of previews.
	    """
	    logger.debug(f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters.")

	    elevenlabs_config = config.elevenlabs_config

	    try:
	        # Synthesize speech using the ElevenLabs SDK
	        response = elevenlabs_config.client.text_to_voice.create_previews(
	            voice_description=character_description,
	            text=text,
	            output_format=elevenlabs_config.output_format,
	        )

	        previews = response.previews
	        if not previews:
	            msg = "No previews returned by ElevenLabs API."
	            logger.error(msg)
	            raise ElevenLabsError(message=msg)

	        # Pick one of the returned previews at random and extract its
	        # base64 encoded audio and generated voice ID
	        preview = random.choice(previews)
	        generated_voice_id = preview.generated_voice_id
	        base64_audio = preview.audio_base_64

	        # NOTE(review): the extension is hard-coded to .mp3 regardless of the configured
	        # output_format — confirm non-mp3 formats are never configured.
	        filename = f"{generated_voice_id}.mp3"

	        # Write audio to file and return the resulting path
	        audio_file_path = save_base64_audio_to_file(base64_audio, filename, config)
	        return None, audio_file_path

	    except ElevenLabsError:
	        # Already a domain error (e.g. "no previews"); re-raise as-is instead of
	        # wrapping it in a second ElevenLabsError.
	        raise

	    except Exception as e:
	        if (
	            isinstance(e, ApiError)
	            and e.status_code is not None
	            and CLIENT_ERROR_CODE <= e.status_code < SERVER_ERROR_CODE
	        ):
	            # Client-side errors will not succeed on a retry, so flag them as unretryable.
	            raise UnretryableElevenLabsError(
	                message=_extract_api_error_message(e),
	                original_exception=e,
	            ) from e

	        raise ElevenLabsError(
	            message=f"{e}",
	            original_exception=e,
	        ) from e