from enum import Enum
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, Field


class VoiceCombineRequest(BaseModel):
    """Request schema for the voice combination endpoint.

    Accepts either a single string with voices joined by "+" or a list of voice names.
    """

    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )


class TTSStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    DELETED = "deleted"  # For files removed by cleanup


# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Word-level timestamp information"""

    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")


class CaptionedSpeechResponse(BaseModel):
    """Response schema for captioned speech endpoint"""

    audio: str = Field(..., description="The generated audio data encoded in base64")
    audio_format: str = Field(..., description="The format of the output audio")
    timestamps: Optional[List[WordTimestamp]] = Field(
        ..., description="Word-level timestamps"
    )


class NormalizationOptions(BaseModel):
    """Options for the text normalization system"""

    normalize: bool = Field(
        default=True,
        description="Normalizes input text to make it easier for the model to say",
    )
    unit_normalization: bool = Field(
        default=False, description="Transforms units like 10KB to 10 kilobytes"
    )
    url_normalization: bool = Field(
        default=True,
        description="Changes URLs so they can be properly pronounced by kokoro",
    )
    email_normalization: bool = Field(
        default=True,
        description="Changes emails so they can be properly pronounced by kokoro",
    )
    optional_pluralization_normalization: bool = Field(
        default=True,
        description="Replaces (s) with s so some words get pronounced correctly",
    )
    phone_normalization: bool = Field(
        default=True,
        description="Changes phone numbers so they can be properly pronounced by kokoro",
    )


class OpenAISpeechRequest(BaseModel):
    """Request schema for OpenAI-compatible speech endpoint"""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = Field(
        default=None,
        description="Optional different format for the final download. If not provided, uses response_format.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )
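# Usage sketch (illustrative, not part of the schemas): constructing a request
# directly shows how unset fields fall back to their declared defaults.
#
#     req = OpenAISpeechRequest(input="Hello, world!", response_format="wav")
#     assert req.model == "kokoro"    # default model
#     assert req.voice == "af_heart"  # default voice
#     assert req.stream is True       # streaming on by default
#     assert req.normalization_options.normalize is True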
class CaptionedSpeechRequest(BaseModel):
    """Request schema for captioned speech endpoint"""

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    normalization_options: Optional[NormalizationOptions] = Field(
        default=NormalizationOptions(),
        description="Options for the normalization system",
    )
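# Minimal smoke test, runnable as a script (a sketch; voice names other than
# the "af_heart" default are hypothetical placeholders).
if __name__ == "__main__":
    # Both accepted shapes for voice combination validate against one schema.
    VoiceCombineRequest(voices="af_heart+af_bella")
    VoiceCombineRequest(voices=["af_heart", "af_bella"])

    # Captioned speech with timestamps disabled; defaults cover the rest.
    req = CaptionedSpeechRequest(input="Testing.", return_timestamps=False)
    assert req.return_timestamps is False
    assert req.speed == 1.0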