Spaces:

pets123
/

sesame_openai

Paused

sesame_openai / app /prompt_engineering.py

01115c6 22 days ago

5.49 kB

	"""Prompt engineering for consistent voice generation."""
	import re
	import random
	from typing import List, Dict, Optional
	import logging

	# Set up logging
	logger = logging.getLogger(__name__)

	# Voice style descriptors for consistent prompting.
	#
	# Maps a voice name to a descriptor dict that always has these three keys:
	#   "adjectives"      - list of single-word qualities describing the voice
	#   "characteristics" - list of short phrases (pitch, pacing, tone, timbre and similar)
	#   "speaking_style"  - one short phrase summarising the overall delivery
	#
	# initialize_templates() returns this same dict object (not a copy).
	# NOTE(review): the first six names look like the OpenAI TTS voice names and
	# "custom" like the entry for user-supplied voices - confirm against the callers.
	VOICE_STYLES = {
	    "alloy": {
	        "adjectives": ["balanced", "natural", "clear", "articulate", "neutral", "conversational"],
	        "characteristics": ["medium pitch", "even pacing", "neutral tone", "balanced resonance"],
	        "speaking_style": "conversational and balanced"
	    },
	    "echo": {
	        "adjectives": ["resonant", "deep", "reverberant", "rich", "sonorous", "full"],
	        "characteristics": ["lower pitch", "deliberate pacing", "resonant tone", "deeper timbre"],
	        "speaking_style": "rich and resonant"
	    },
	    "fable": {
	        "adjectives": ["bright", "light", "clear", "energetic", "articulate", "animated"],
	        "characteristics": ["higher pitch", "lively pacing", "bright tone", "clear articulation"],
	        "speaking_style": "bright and energetic"
	    },
	    "onyx": {
	        "adjectives": ["deep", "authoritative", "powerful", "commanding", "strong", "resolute"],
	        "characteristics": ["low pitch", "measured pacing", "authoritative tone", "strong projection"],
	        "speaking_style": "deep and authoritative"
	    },
	    "nova": {
	        "adjectives": ["warm", "pleasant", "smooth", "harmonious", "gentle", "comforting"],
	        "characteristics": ["medium pitch", "smooth pacing", "warm tone", "pleasant timbre"],
	        "speaking_style": "warm and smooth"
	    },
	    "shimmer": {
	        "adjectives": ["light", "airy", "bright", "crystalline", "delicate", "expressive"],
	        "characteristics": ["higher pitch", "quick pacing", "light tone", "bright timbre"],
	        "speaking_style": "light and expressive"
	    },
	    "custom": {
	        "adjectives": ["clear", "distinct", "authentic", "natural", "personalized", "unique"],
	        "characteristics": ["natural rhythm", "authentic tone", "personal inflection", "distinctive sound"],
	        "speaking_style": "authentic and natural"
	    }
	}

	def initialize_templates() -> Dict[str, dict]:
	    """Initialize prompt templates - placeholder for any future setup.

	    Currently this only logs that initialization happened; no templates are
	    built or modified.

	    Returns:
	        The module-level ``VOICE_STYLES`` mapping itself (not a copy), so
	        any mutation by the caller is visible to every other user of it.
	    """
	    logger.info("Prompt templates initialized")
	    return VOICE_STYLES

	def _extend_segment(segments: List[str], current: str, piece: str, max_chars: int) -> str:
	    """Add ``piece`` to the in-progress segment, flushing first if it will not fit.

	    Args:
	        segments: Finished segments; the in-progress one is appended here
	            when ``piece`` cannot be joined to it.
	        current: The in-progress segment ("" when nothing is pending).
	        piece: Sentence, phrase or word to add.
	        max_chars: Maximum characters per segment.
	    Returns:
	        The new in-progress segment.
	    """
	    if not piece:
	        return current
	    if not current:
	        # A piece longer than max_chars is still accepted whole here: it is
	        # a single unbreakable word and will be flushed on its own.
	        return piece
	    # The +1 accounts for the space that joins the two parts.
	    if len(current) + 1 + len(piece) <= max_chars:
	        return f"{current} {piece}"
	    segments.append(current)
	    return piece


	def split_into_segments(text: str, max_chars: int = 150) -> List[str]:
	    """Split text into optimal segments for better generation.

	    Text is packed greedily sentence by sentence. A sentence longer than
	    ``max_chars`` is broken at ``,``/``;``/``:`` into phrases, and a phrase
	    that is still too long is broken at whitespace into words. Segments are
	    returned in reading order and never empty. Every segment is at most
	    ``max_chars`` long, except that a single word longer than ``max_chars``
	    is kept whole rather than cut in the middle.

	    Args:
	        text: Text to split
	        max_chars: Maximum characters per segment
	    Returns:
	        List of text segments. Empty or short input is returned unchanged as
	        a one-element list.
	    """
	    # Handle empty or very short text
	    if not text or len(text) <= max_chars:
	        return [text]

	    segments: List[str] = []
	    current = ""

	    # Split by sentences first
	    for sentence in re.split(r'(?<=[.!?])\s+', text.strip()):
	        if len(sentence) <= max_chars:
	            current = _extend_segment(segments, current, sentence, max_chars)
	            continue

	        # An over-long sentence always starts a fresh segment.
	        if current:
	            segments.append(current)
	            current = ""

	        for phrase in re.split(r'(?<=[,;:])\s+', sentence):
	            if len(phrase) <= max_chars:
	                current = _extend_segment(segments, current, phrase, max_chars)
	                continue

	            # Flush the phrases collected so far BEFORE emitting the word
	            # chunks of this phrase, otherwise the output is out of order.
	            if current:
	                segments.append(current)
	                current = ""
	            for word in phrase.split():
	                current = _extend_segment(segments, current, word, max_chars)
	            # Close the last word chunk so the next phrase starts cleanly.
	            if current:
	                segments.append(current)
	                current = ""

	    # Add the last segment
	    if current:
	        segments.append(current)

	    logger.info("Split text into %d segments", len(segments))
	    return segments

	def format_text_for_voice(text: str, voice_name: str, segment_index: int = 0, total_segments: int = 1) -> str:
	    """Return ``text`` unchanged; retained as the hook for voice formatting.

	    Args:
	        text: Text to format
	        voice_name: Name of the voice (currently unused)
	        segment_index: Index of this segment, for multi-segment texts
	            (currently unused)
	        total_segments: Total number of segments (currently unused)
	    Returns:
	        The input text, unmodified.
	    """
	    # IMPORTANT: We no longer add voice instructions in brackets since CSM reads them aloud.
	    # Instead, speaker IDs control voice identity, which is what the model expects.

	    # Just return the unmodified text - the Generator class will handle proper formatting
	    return text