MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 13

Commit

7cd7655

verified ·

1 Parent(s): ed4e888

Update utils.py

Browse files

Files changed (1) hide show

utils.py +90 -139

utils.py CHANGED Viewed

@@ -18,14 +18,14 @@ import torch
 import random
 class DialogueItem(BaseModel):
-    speaker: Literal["Jane", "John"]   # For TTS voice
     display_speaker: str = "Jane"      # For display in transcript
     text: str
 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]
-# Initialize Whisper ASR pipeline (unused for YouTube since we use RapidAPI)
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny.en",
@@ -33,10 +33,6 @@ asr_pipeline = pipeline(
 )
 def truncate_text(text, max_tokens=2048):
-    """
-    If the text exceeds the max token limit (approx. 2,048), truncate it
-    to avoid exceeding the model's context window.
-    """
     print("[LOG] Truncating text if needed.")
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text)
@@ -46,10 +42,6 @@ def truncate_text(text, max_tokens=2048):
     return text
 def extract_text_from_url(url):
-    """
-    Fetches and extracts readable text from a given URL
-    (stripping out scripts, styles, etc.).
-    """
     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {
@@ -74,29 +66,17 @@ def extract_text_from_url(url):
         return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
-    """
-    Shifts the pitch of an AudioSegment by a given number of semitones.
-    Positive semitones shift the pitch up, negative shifts it down.
-    """
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
 def is_sufficient(text: str, min_word_count: int = 500) -> bool:
-    """
-    Checks if the fetched text meets our sufficiency criteria
-    (e.g., at least 500 words).
-    """
     word_count = len(text.split())
     print(f"[DEBUG] Aggregated word count: {word_count}")
     return word_count >= min_word_count
 def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
-    """
-    Queries the Groq API to retrieve more info from the LLM's knowledge base.
-    Appends it to our aggregated info if found.
-    """
     print("[LOG] Querying LLM for additional information.")
     system_prompt = (
         "You are an AI assistant with extensive knowledge up to 2023-10. "
@@ -122,10 +102,6 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
     return additional_info
 def research_topic(topic: str) -> str:
-    """
-    Gathers info from various RSS feeds and Wikipedia. If needed, queries the LLM
-    for more data if the aggregated text is insufficient.
-    """
     sources = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -136,15 +112,12 @@ def research_topic(topic: str) -> str:
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
         "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
     }
     summary_parts = []
-    # Wikipedia summary
     wiki_summary = fetch_wikipedia_summary(topic)
     if wiki_summary:
         summary_parts.append(f"From Wikipedia: {wiki_summary}")
-    # For each RSS feed
     for name, feed_url in sources.items():
         try:
             items = fetch_rss_feed(feed_url)
@@ -165,7 +138,6 @@ def research_topic(topic: str) -> str:
     print("[DEBUG] Aggregated info from primary sources:")
     print(aggregated_info)
-    # Fallback to LLM if insufficient
     if not is_sufficient(aggregated_info):
         print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
         additional_info = query_llm_for_additional_info(topic, aggregated_info)
@@ -180,9 +152,6 @@ def research_topic(topic: str) -> str:
     return aggregated_info
 def fetch_wikipedia_summary(topic: str) -> str:
-    """
-    Fetch a quick Wikipedia summary of the topic via the official Wikipedia API.
-    """
     print("[LOG] Fetching Wikipedia summary for:", topic)
     try:
         search_url = (
@@ -209,9 +178,6 @@ def fetch_wikipedia_summary(topic: str) -> str:
         return ""
 def fetch_rss_feed(feed_url: str) -> list:
-    """
-    Pulls RSS feed data from a given URL and returns items.
-    """
     print("[LOG] Fetching RSS feed:", feed_url)
     try:
         resp = requests.get(feed_url)
@@ -226,10 +192,6 @@ def fetch_rss_feed(feed_url: str) -> list:
         return []
 def find_relevant_article(items, topic: str, min_match=2) -> tuple:
-    """
-    Check each article in the RSS feed for mention of the topic
-    by counting the number of keyword matches.
-    """
     print("[LOG] Finding relevant articles...")
     keywords = re.findall(r'\w+', topic.lower())
     for item in items:
@@ -244,9 +206,6 @@ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
     return None, None, None
 def fetch_article_text(link: str) -> str:
-    """
-    Fetch the article text from the given link (first 5 paragraphs).
-    """
     print("[LOG] Fetching article text from:", link)
     if not link:
         print("[LOG] No link provided for article text.")
@@ -274,17 +233,9 @@ def generate_script(
     guest_name: str = "John",
     sponsor_style: str = "Separate Break"
 ):
-    """
-    Sends the system_prompt plus input_text to the Groq LLM to generate a
-    multi-speaker Dialogue in JSON, returning a Dialogue object.
-    sponsor_style can be "Separate Break" or "Blended".
-    We add instructions telling the model how to integrate the sponsor content.
-    """
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-    # Parse numeric minutes
     words_per_minute = 150
     numeric_minutes = 3
     match = re.search(r"(\d+)", target_length)
@@ -302,13 +253,12 @@ def generate_script(
     }
     chosen_tone = tone_map.get(tone, "casual")
-    # Sponsor instructions
     if sponsor_style == "Separate Break":
         sponsor_instructions = (
             "If sponsor content is provided, include it in a separate ad break (~30 seconds). "
             "Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
         )
-    else:
         sponsor_instructions = (
             "If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
             "Avoid abrupt transitions."
@@ -334,7 +284,6 @@ def generate_script(
         "    ]\n"
         "}"
     )
     print("[LOG] Sending prompt to Groq:")
     print(prompt)
@@ -361,7 +310,6 @@ def generate_script(
         data = json.loads(json_str)
         dialogue_list = data.get("dialogue", [])
-        # Map raw speaker -> Jane or John, storing display_speaker
         for d in dialogue_list:
             raw_speaker = d.get("speaker", "Jane")
             if raw_speaker.lower() == host_name.lower():
@@ -371,7 +319,7 @@ def generate_script(
                 d["speaker"] = "John"
                 d["display_speaker"] = guest_name
             else:
-                d["speaker"] = "Jane"  # default
                 d["display_speaker"] = raw_speaker
         new_dialogue_items = []
@@ -389,17 +337,8 @@ def generate_script(
         print("[ERROR] JSON decoding failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
-# -------------------------------------------------------
-# Replaces the old approach for YouTube with RapidAPI
-# -------------------------------------------------------
 def transcribe_youtube_video(video_url: str) -> str:
-    """
-    Transcribe a YouTube video by calling the RapidAPI 'youtube-transcriptor' endpoint.
-    1) Extract the 11-char video ID from the YouTube URL.
-    2) Call the RapidAPI endpoint (lang=en).
-    3) Parse 'transcriptionAsText' from the response.
-    4) Return that transcript as a string.
-    """
     print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
     video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
     if not video_id_match:
@@ -436,10 +375,7 @@ def transcribe_youtube_video(video_url: str) -> str:
         print("[LOG] Transcript retrieval successful.")
         print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
-        if len(transcript_as_text) > 200:
-            snippet = transcript_as_text[:200] + "..."
-        else:
-            snippet = transcript_as_text
         print(f"[DEBUG] Transcript Snippet: {snippet}")
         return transcript_as_text
@@ -451,20 +387,18 @@ def transcribe_youtube_video(video_url: str) -> str:
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
-    We also do some pre-processing for punctuation, abbreviations, etc.
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
-        # Preprocess text for TTS
         processed_text = _preprocess_text_for_tts(text, speaker)
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
-            "model": "aura-asteria-en",  # female voice by default
         }
         if speaker == "John":
-            params["model"] = "aura-zeus-en"  # male voice
         headers = {
             "Accept": "audio/mpeg",
@@ -489,7 +423,7 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
                     mp3_file.write(chunk)
             mp3_path = mp3_file.name
-        # Normalize the volume
         audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
         audio_seg = effects.normalize(audio_seg)
@@ -500,75 +434,69 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
             os.remove(mp3_path)
         return final_mp3_path
     except Exception as e:
         print("[ERROR] Error generating audio:", e)
         raise ValueError(f"Error generating audio: {str(e)}")
 def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
-    """
-    Original ytdlp-based approach for local transcription. No longer used.
-    """
     pass
 def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     """
-    Enhances text for natural-sounding TTS by inserting periods in uppercase
-    abbreviations (e.g. "AI" -> "A.I."), then removing them so TTS doesn't say 'dot'.
-    'SaaS' remains 'sass' as previously requested.
-    Example:
-      "AI"   -> displayed as "A.I." but TTS sees "A I"
-      "CIA"  -> displayed as "C.I.A." but TTS sees "C I A"
-      "F1"   -> displayed as "F.1." but TTS sees "F 1"
     """
-    # 1) Special case: "SaaS" => "sass"
-    #    We'll do this first so we don't insert periods for S-A-A-S inadvertently.
     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
-    # 2) Insert periods in uppercase abbreviations (2+ letters/digits):
-    #    e.g. "AI" -> "A.I.", "CIA"->"C.I.A.", "F1"->"F.1."
-    def insert_periods_for_abbrev(match):
-        abbr = match.group(0)  # e.g. "CIA"
-        # Insert a period after each character: "C.I.A."
         parted = ".".join(list(abbr)) + "."
         return parted
-    # Insert periods for 2+ uppercase letters or digits, ignoring 'sass' we already replaced
     text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
-    # A small fix to remove double periods if they appear
-    text = re.sub(r"\.\.", ".", text)
-    # 3) Now remove those periods from TTS so it won't say 'dot'
-    #    "A.I." -> "A I", "C.I.A." -> "C I A", "F.1." -> "F 1"
-    def remove_periods_for_tts(match):
-        chunk = match.group(0)
-        # e.g. "C.I.A." => remove '.' => "C I A "
-        # Then strip trailing space => "C I A"
         return chunk.replace(".", " ").strip()
-    # Matches things like "A.I." or "C.I.A." or "F.1."
     text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
-    # 4) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
-    # 5) Convert decimals like 3.14 -> "three point one four"
     def convert_decimal(m):
         number_str = m.group()
         parts = number_str.split('.')
         whole_part = _spell_digits(parts[0])
         decimal_part = " ".join(_spell_digits(d) for d in parts[1])
         return f"{whole_part} point {decimal_part}"
-    text = re.sub(r"\d+\.\d+", convert_decimal, text)
-    # 6) Expand leftover all-caps abbreviations
-    #    e.g. NASA -> "N A S A", if not already dotted
     def expand_abbreviations(m):
         abbrev = m.group()
-        # If it's plural: e.g. "MPs" -> "M Peas"
         if abbrev.endswith('s') and abbrev[:-1].isupper():
             singular = abbrev[:-1]
             expanded = " ".join(list(singular)) + "s"
             special_plurals = {
@@ -579,10 +507,9 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
             return special_plurals.get(abbrev, expanded)
         else:
             return " ".join(list(abbrev))
     text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
-    # 7) Insert filler words if speaker != "Jane"
     if speaker != "Jane":
         def insert_thinking_pause(m):
             word = m.group(1)
@@ -591,7 +518,6 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
                 return f"{word}..., {filler}"
             else:
                 return f"{word}...,"
         keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
         text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
@@ -609,31 +535,57 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     return text.strip()
-def _spell_digits(d: str) -> str:
     """
-    Convert digits e.g. '3' -> 'three'.
     """
-    digit_map = {
-        '0': 'zero',
-        '1': 'one',
-        '2': 'two',
-        '3': 'three',
-        '4': 'four',
-        '5': 'five',
-        '6': 'six',
-        '7': 'seven',
-        '8': 'eight',
-        '9': 'nine'
-    }
-    return " ".join(digit_map[ch] for ch in d if ch in digit_map)
 def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
-    """
-    Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
-    1) Start with 2 seconds of music alone before speech begins.
-    2) Loop music if shorter than final audio length.
-    3) Lower music volume so speech is clear.
-    """
     if custom_music_path:
         music_path = custom_music_path
     else:
@@ -645,7 +597,6 @@ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegm
         print("[ERROR] Failed to load background music:", e)
         return spoken
-    # Lower music volume
     bg_music = bg_music - 18.0
     total_length_ms = len(spoken) + 2000

 import random
 class DialogueItem(BaseModel):
+    speaker: Literal["Jane", "John"]   # TTS voice
     display_speaker: str = "Jane"      # For display in transcript
     text: str
 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]
+# Initialize Whisper (unused for YouTube with RapidAPI)
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny.en",
 )
 def truncate_text(text, max_tokens=2048):
     print("[LOG] Truncating text if needed.")
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text)
     return text
 def extract_text_from_url(url):
     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {
         return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     """
     Shift the pitch of ``audio`` by ``semitones`` semitones.

     The raw sample data is re-tagged with a frame rate scaled by
     2 ** (semitones / 12) and the result is then converted back to the
     original frame rate. Positive values shift the pitch up, negative
     values shift it down.
     """
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     original_rate = audio.frame_rate
     ratio = 2.0 ** (semitones / 12.0)
     # Same samples, different declared rate: this is what moves the pitch.
     retagged = audio._spawn(
         audio.raw_data,
         overrides={'frame_rate': int(original_rate * ratio)},
     )
     # Convert back so the returned segment reports the original frame rate.
     return retagged.set_frame_rate(original_rate)
 def is_sufficient(text: str, min_word_count: int = 500) -> bool:
     """
     Report whether ``text`` holds at least ``min_word_count``
     whitespace-separated words (500 by default).

     The computed word count is also printed as a debug line.
     """
     words = text.split()
     total = len(words)
     print(f"[DEBUG] Aggregated word count: {total}")
     enough = total >= min_word_count
     return enough
 def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
     print("[LOG] Querying LLM for additional information.")
     system_prompt = (
         "You are an AI assistant with extensive knowledge up to 2023-10. "
     return additional_info
 def research_topic(topic: str) -> str:
     sources = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
         "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
     }
     summary_parts = []
     wiki_summary = fetch_wikipedia_summary(topic)
     if wiki_summary:
         summary_parts.append(f"From Wikipedia: {wiki_summary}")
     for name, feed_url in sources.items():
         try:
             items = fetch_rss_feed(feed_url)
     print("[DEBUG] Aggregated info from primary sources:")
     print(aggregated_info)
     if not is_sufficient(aggregated_info):
         print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
         additional_info = query_llm_for_additional_info(topic, aggregated_info)
     return aggregated_info
 def fetch_wikipedia_summary(topic: str) -> str:
     print("[LOG] Fetching Wikipedia summary for:", topic)
     try:
         search_url = (
         return ""
 def fetch_rss_feed(feed_url: str) -> list:
     print("[LOG] Fetching RSS feed:", feed_url)
     try:
         resp = requests.get(feed_url)
         return []
 def find_relevant_article(items, topic: str, min_match=2) -> tuple:
     print("[LOG] Finding relevant articles...")
     keywords = re.findall(r'\w+', topic.lower())
     for item in items:
     return None, None, None
 def fetch_article_text(link: str) -> str:
     print("[LOG] Fetching article text from:", link)
     if not link:
         print("[LOG] No link provided for article text.")
     guest_name: str = "John",
     sponsor_style: str = "Separate Break"
 ):
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
     words_per_minute = 150
     numeric_minutes = 3
     match = re.search(r"(\d+)", target_length)
     }
     chosen_tone = tone_map.get(tone, "casual")
     if sponsor_style == "Separate Break":
         sponsor_instructions = (
             "If sponsor content is provided, include it in a separate ad break (~30 seconds). "
             "Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
         )
+    else:  # Blended
         sponsor_instructions = (
             "If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
             "Avoid abrupt transitions."
         "    ]\n"
         "}"
     )
     print("[LOG] Sending prompt to Groq:")
     print(prompt)
         data = json.loads(json_str)
         dialogue_list = data.get("dialogue", [])
         for d in dialogue_list:
             raw_speaker = d.get("speaker", "Jane")
             if raw_speaker.lower() == host_name.lower():
                 d["speaker"] = "John"
                 d["display_speaker"] = guest_name
             else:
+                d["speaker"] = "Jane"
                 d["display_speaker"] = raw_speaker
         new_dialogue_items = []
         print("[ERROR] JSON decoding failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
 def transcribe_youtube_video(video_url: str) -> str:
     print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
     video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
     if not video_id_match:
         print("[LOG] Transcript retrieval successful.")
         print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
+        snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
         print(f"[DEBUG] Transcript Snippet: {snippet}")
         return transcript_as_text
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
+    We also do some pre-processing for punctuation, abbreviations, numeric expansions, etc.
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
         processed_text = _preprocess_text_for_tts(text, speaker)
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
+            "model": "aura-asteria-en",  # female by default
         }
         if speaker == "John":
+            params["model"] = "aura-zeus-en"
         headers = {
             "Accept": "audio/mpeg",
                     mp3_file.write(chunk)
             mp3_path = mp3_file.name
+        # Normalize
         audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
         audio_seg = effects.normalize(audio_seg)
             os.remove(mp3_path)
         return final_mp3_path
     except Exception as e:
         print("[ERROR] Error generating audio:", e)
         raise ValueError(f"Error generating audio: {str(e)}")
 def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
     pass
 def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     """
+    1) "SaaS" => "sass"
+    2) Insert periods for uppercase abbreviations => remove them for TTS
+    3) Convert decimals "3.14" => "three point one four"
+    4) For pure integer numbers (e.g. "10", "2023") => "ten", "two thousand twenty three"
+    5) Expand leftover all-caps
+    6) Insert fillers if speaker != "Jane"
+    7) Remove random fillers
+    8) Capitalize sentence starts
     """
+    # 1) "SaaS" => "sass"
     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
+    # 2) Insert periods for uppercase abbreviations of length >=2 => e.g. "CIA" -> "C.I.A."
+    def insert_periods_for_abbrev(m):
+        abbr = m.group(0)
         parted = ".".join(list(abbr)) + "."
         return parted
     text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
+    text = re.sub(r"\.\.", ".", text)  # remove double-dots
+    # 2b) Then remove those periods => TTS won't say "dot"
+    def remove_periods_for_tts(m):
+        chunk = m.group(0)
         return chunk.replace(".", " ").strip()
     text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
+    # 3) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
+    # 4) Convert decimals like "3.14" => "three point one four"
     def convert_decimal(m):
         number_str = m.group()
         parts = number_str.split('.')
         whole_part = _spell_digits(parts[0])
         decimal_part = " ".join(_spell_digits(d) for d in parts[1])
         return f"{whole_part} point {decimal_part}"
+    text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
+    # 5) Convert pure integer numbers => e.g. "10" -> "ten", "42" -> "forty two"
+    #    We'll do a quick function for small-ish integers (up to 9999 for demo).
+    def convert_int_to_words(m):
+        num_str = m.group()
+        # e.g. "10" => 10 => "ten"
+        # "2023" => "two thousand twenty three"
+        # For brevity, handle up to 99999 or so. Or you can import "num2words" for a robust approach.
+        return number_to_words(int(num_str))
+    text = re.sub(r"\b\d+\b", convert_int_to_words, text)
+    # 6) Expand leftover all-caps abbreviations => "NASA" => "N A S A"
     def expand_abbreviations(m):
         abbrev = m.group()
         if abbrev.endswith('s') and abbrev[:-1].isupper():
+            # Plural e.g. "MPs" => "M Peas"
             singular = abbrev[:-1]
             expanded = " ".join(list(singular)) + "s"
             special_plurals = {
             return special_plurals.get(abbrev, expanded)
         else:
             return " ".join(list(abbrev))
     text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
+    # 7) If speaker != Jane, insert filler words around certain keywords
     if speaker != "Jane":
         def insert_thinking_pause(m):
             word = m.group(1)
                 return f"{word}..., {filler}"
             else:
                 return f"{word}...,"
         keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
         text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
     return text.strip()
def _spell_digits(d: str) -> str:
    """
    Spell out each digit character of ``d``, e.g. '305' -> 'three zero five'.

    Characters other than 0-9 are skipped. The decimal conversion inside
    _preprocess_text_for_tts calls this helper, and number_to_words uses it
    as a fallback for integers beyond its named scales.
    """
    digit_map = {
        '0': 'zero',
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine',
    }
    return " ".join(digit_map[ch] for ch in d if ch in digit_map)


def number_to_words(n: int) -> str:
    """
    Convert an integer into English words for TTS.

    Examples: 10 -> "ten", 42 -> "forty two",
    2023 -> "two thousand and twenty three",
    123456 -> "one hundred and twenty three thousand four hundred and fifty six".

    Args:
        n: The integer to convert. Negative values are prefixed with "minus".

    Returns:
        The spoken form of ``n``. Values of 10**15 or more have no named
        scale here and are read digit by digit instead of raising.

    If you want a robust approach, consider the 'num2words' library.
    """
    if n == 0:
        return "zero"
    if n < 0:
        return "minus " + number_to_words(abs(n))

    ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen"]
    tens_words = ["", "", "twenty", "thirty", "forty", "fifty",
                  "sixty", "seventy", "eighty", "ninety"]
    # Name of each base-1000 group, least significant first.
    scales = ["", "thousand", "million", "billion", "trillion"]

    # The caller matches digit runs of any length, so never index past the
    # known scales: read oversized numbers one digit at a time.
    if n >= 1000 ** len(scales):
        return _spell_digits(str(n))

    def below_hundred(x):
        # Words for 1..99.
        if x < 10:
            return ones[x]
        if x < 20:
            return teens[x - 10]
        tens_part, ones_part = divmod(x, 10)
        return tens_words[tens_part] + (f" {ones[ones_part]}" if ones_part else "")

    def below_thousand(x, force_and=False):
        # Words for 1..999; "and" joins the last two digits to what precedes them.
        hundreds, last_two = divmod(x, 100)
        parts = []
        if hundreds:
            parts.extend([ones[hundreds], "hundred"])
        if last_two:
            if hundreds or force_and:
                parts.append("and")
            parts.append(below_hundred(last_two))
        return " ".join(parts)

    # Split into three-digit groups, least significant first.
    groups = []
    remaining = n
    while remaining:
        remaining, group = divmod(remaining, 1000)
        groups.append(group)

    words = []
    for idx in range(len(groups) - 1, -1, -1):
        group = groups[idx]
        if not group:
            continue
        if idx == 0:
            # Final group takes "and" whenever any higher group was spoken,
            # e.g. 1001 -> "one thousand and one".
            words.append(below_thousand(group, force_and=bool(words)))
        else:
            words.append(below_thousand(group))
            words.append(scales[idx])
    return " ".join(words)
 def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
     if custom_music_path:
         music_path = custom_music_path
     else:
         print("[ERROR] Failed to load background music:", e)
         return spoken
     bg_music = bg_music - 18.0
     total_length_ms = len(spoken) + 2000