"""Voice embeddings for consistent voice generation.""" | |
import os | |
import torch | |
import torchaudio | |
import numpy as np | |
from typing import Dict | |
# Path to store voice samples | |
VOICE_SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "voice_samples") | |
os.makedirs(VOICE_SAMPLES_DIR, exist_ok=True) | |
# Dictionary to store voice embeddings/samples | |
VOICE_DICT: Dict[str, torch.Tensor] = {} | |
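
# Convention assumed throughout this module (inferred from the code below, not
# enforced anywhere): every stored sample is a mono, 1-D float32 tensor at
# 24 kHz, matching the default sample_rate of initialize_voices() and the
# hard-coded one-second window in update_voice_sample().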


def initialize_voices(sample_rate: int = 24000):
    """Initialize voice dictionary with consistent samples."""
    # Generate a deterministic seed waveform for each voice. The signals are
    # built from fixed sine frequencies, so they differ per voice but remain
    # identical across runs (no RNG is involved).
    voice_names = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    for voice_id, voice_name in enumerate(voice_names):
        # Generate 1 second of "seed" audio
        duration = 1.0  # seconds
        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

        # Create a distinctive waveform for each voice
        if voice_id == 0:  # alloy - rich mid tones
            freq1, freq2 = 220, 440
            audio = 0.5 * np.sin(2 * np.pi * freq1 * t) + 0.3 * np.sin(2 * np.pi * freq2 * t)
        elif voice_id == 1:  # echo - reverberant, exponentially decaying
            freq = 330
            audio = np.sin(2 * np.pi * freq * t) * np.exp(-t * 3)
        elif voice_id == 2:  # fable - bright, higher pitch
            freq = 523
            audio = 0.7 * np.sin(2 * np.pi * freq * t)
        elif voice_id == 3:  # onyx - deep and resonant
            freq = 165
            audio = 0.8 * np.sin(2 * np.pi * freq * t)
        elif voice_id == 4:  # nova - warm and smooth
            freq1, freq2 = 392, 196
            audio = 0.4 * np.sin(2 * np.pi * freq1 * t) + 0.4 * np.sin(2 * np.pi * freq2 * t)
        else:  # shimmer - airy and light
            freq1, freq2, freq3 = 587, 880, 1174
            audio = (
                0.3 * np.sin(2 * np.pi * freq1 * t)
                + 0.2 * np.sin(2 * np.pi * freq2 * t)
                + 0.1 * np.sin(2 * np.pi * freq3 * t)
            )

        # Normalize to [-1, 1]
        audio = audio / np.max(np.abs(audio))

        # Convert to a float32 tensor and store it
        audio_tensor = torch.tensor(audio, dtype=torch.float32)
        VOICE_DICT[voice_name] = audio_tensor

        # Save as wav for reference (torchaudio.save expects a (channels, time) tensor)
        save_path = os.path.join(VOICE_SAMPLES_DIR, f"{voice_name}_seed.wav")
        torchaudio.save(save_path, audio_tensor.unsqueeze(0), sample_rate)
        print(f"Initialized voice seed for {voice_name}")


def get_voice_sample(voice_name: str) -> torch.Tensor:
    """Get the voice sample for a given voice name."""
    if not VOICE_DICT:
        initialize_voices()
    if voice_name in VOICE_DICT:
        return VOICE_DICT[voice_name]
    # Default to alloy if the requested voice is not found
    print(f"Voice {voice_name} not found, defaulting to alloy")
    return VOICE_DICT["alloy"]


def update_voice_sample(voice_name: str, audio: torch.Tensor):
    """Update the voice sample with recently generated audio.

    Assumes ``audio`` is a mono, 1-D tensor at 24 kHz, matching the samples
    created by initialize_voices().
    """
    # Only update if we've already initialized
    if VOICE_DICT:
        # Keep the last second of audio (24000 samples at 24 kHz), or
        # whatever is available if the clip is shorter
        sample_length = min(24000, audio.shape[0])
        VOICE_DICT[voice_name] = audio[-sample_length:].detach().cpu()
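

# Minimal usage sketch (an illustrative addition, not part of the original
# module's API): initialize the seed voices, fetch one, and refresh it with a
# stand-in tensor playing the role of freshly generated speech.
if __name__ == "__main__":
    initialize_voices()

    # Fetch a known voice and an unknown one (the latter falls back to alloy)
    sample = get_voice_sample("nova")
    fallback = get_voice_sample("unknown-voice")
    print(f"nova seed: {tuple(sample.shape)}, fallback: {tuple(fallback.shape)}")

    # Pretend 2 seconds of new 24 kHz audio were generated for "nova";
    # only the trailing second is kept as the new seed.
    generated = torch.zeros(48000)
    update_voice_sample("nova", generated)
    print(f"nova updated: {tuple(get_voice_sample('nova').shape)}")  # (24000,)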