Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -16,7 +16,6 @@ from groq import Groq
|
|
16 |
import numpy as np
|
17 |
import torch
|
18 |
import random
|
19 |
-
from num2words import num2words # For robust number-to-words conversion
|
20 |
|
21 |
class DialogueItem(BaseModel):
|
22 |
speaker: Literal["Jane", "John"] # TTS voice
|
@@ -266,36 +265,56 @@ def fetch_article_text(link: str) -> str:
|
|
266 |
print(f"[ERROR] Error fetching article text: {e}")
|
267 |
return ""
|
268 |
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
274 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
"10-20 Mins": (1500, 3000)
|
282 |
-
}
|
283 |
-
min_words, max_words = length_mapping.get(target_length, (200, 450))
|
284 |
|
285 |
-
|
|
|
|
|
|
|
286 |
"Humorous": "funny and exciting, makes people chuckle",
|
287 |
"Formal": "business-like, well-structured, professional",
|
288 |
"Casual": "like a conversation between close friends, relaxed and informal",
|
289 |
"Youthful": "like how teenagers might chat, energetic and lively"
|
290 |
}
|
291 |
-
chosen_tone =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
-
# Construct prompt
|
294 |
prompt = (
|
295 |
f"{system_prompt}\n"
|
296 |
f"TONE: {chosen_tone}\n"
|
297 |
-
f"TARGET LENGTH: {target_length} ({min_words}-{max_words} words)\n"
|
298 |
f"INPUT TEXT: {input_text}\n\n"
|
|
|
299 |
"Please provide the output in the following JSON format without any additional text:\n\n"
|
300 |
"{\n"
|
301 |
' "dialogue": [\n'
|
@@ -325,33 +344,46 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
|
|
325 |
raise ValueError(f"Error communicating with Groq API: {str(e)}")
|
326 |
|
327 |
raw_content = response.choices[0].message.content.strip()
|
328 |
-
# Attempt to parse JSON
|
329 |
start_index = raw_content.find('{')
|
330 |
end_index = raw_content.rfind('}')
|
331 |
if start_index == -1 or end_index == -1:
|
332 |
raise ValueError("Failed to parse dialogue: No JSON found.")
|
333 |
|
334 |
json_str = raw_content[start_index:end_index+1].strip()
|
|
|
335 |
try:
|
336 |
data = json.loads(json_str)
|
337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
except Exception as e:
|
339 |
print("[ERROR] JSON decoding failed:", e)
|
340 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
341 |
|
342 |
-
|
343 |
-
# REPLACE the YTDLP-based approach with the RapidAPI approach
|
344 |
-
# ----------------------------------------------------------------------
|
345 |
def transcribe_youtube_video(video_url: str) -> str:
|
346 |
-
"""
|
347 |
-
Transcribe the given YouTube video by calling the RapidAPI 'youtube-transcriptor' endpoint.
|
348 |
-
1) Extract the 11-char video ID from the YouTube URL.
|
349 |
-
2) Call the RapidAPI endpoint (lang=en).
|
350 |
-
3) Parse and extract 'transcriptionAsText' from the response.
|
351 |
-
4) Return that transcript as a string.
|
352 |
-
"""
|
353 |
print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
|
354 |
-
# Extract video ID
|
355 |
video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
|
356 |
if not video_id_match:
|
357 |
raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
|
@@ -372,7 +404,7 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
372 |
try:
|
373 |
response = requests.get(base_url, headers=headers, params=params, timeout=30)
|
374 |
print("[LOG] RapidAPI Response Status Code:", response.status_code)
|
375 |
-
print("[LOG] RapidAPI Response Body:", response.text)
|
376 |
|
377 |
if response.status_code != 200:
|
378 |
raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
|
@@ -381,19 +413,13 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
381 |
if not isinstance(data, list) or not data:
|
382 |
raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
|
383 |
|
384 |
-
# Extract 'transcriptionAsText'
|
385 |
transcript_as_text = data[0].get('transcriptionAsText', '').strip()
|
386 |
if not transcript_as_text:
|
387 |
raise ValueError("transcriptionAsText field is missing or empty.")
|
388 |
|
389 |
print("[LOG] Transcript retrieval successful.")
|
390 |
print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
|
391 |
-
|
392 |
-
# Optionally, print a snippet of the transcript
|
393 |
-
if len(transcript_as_text) > 200:
|
394 |
-
snippet = transcript_as_text[:200] + "..."
|
395 |
-
else:
|
396 |
-
snippet = transcript_as_text
|
397 |
print(f"[DEBUG] Transcript Snippet: {snippet}")
|
398 |
|
399 |
return transcript_as_text
|
@@ -405,17 +431,16 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
405 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
406 |
"""
|
407 |
Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
|
408 |
-
We also do some pre-processing for punctuation, abbreviations,
|
|
|
409 |
"""
|
410 |
try:
|
411 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
412 |
-
|
413 |
-
# Preprocess text with speaker context
|
414 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
415 |
|
416 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
417 |
params = {
|
418 |
-
"model": "aura-asteria-en", # default
|
419 |
}
|
420 |
if speaker == "John":
|
421 |
params["model"] = "aura-zeus-en"
|
@@ -459,94 +484,73 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
|
|
459 |
raise ValueError(f"Error generating audio: {str(e)}")
|
460 |
|
461 |
def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
|
462 |
-
"""
|
463 |
-
Original ytdlp-based approach for local transcription.
|
464 |
-
No longer used, but kept for reference.
|
465 |
-
"""
|
466 |
pass
|
467 |
|
468 |
-
# ---------------------------------------------------------------------
|
469 |
-
# TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
|
470 |
-
# ---------------------------------------------------------------------
|
471 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
472 |
"""
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
"""
|
477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
478 |
text = re.sub(r"-", " ", text)
|
479 |
|
480 |
-
#
|
481 |
-
def convert_decimal(m):
|
482 |
-
number_str = m.group()
|
483 |
-
parts = number_str.split('.')
|
484 |
-
whole_part = _spell_digits(parts[0])
|
485 |
-
decimal_part = " ".join(_spell_digits(d) for d in parts[1])
|
486 |
-
return f"{whole_part} point {decimal_part}"
|
487 |
-
|
488 |
-
text = re.sub(r"\d+\.\d+", convert_decimal, text)
|
489 |
-
|
490 |
-
# 3) Abbreviations (e.g., NASA -> N A S A, MPs -> M Peas)
|
491 |
-
def expand_abbreviations(match):
|
492 |
-
abbrev = match.group()
|
493 |
-
# Check if it's a plural abbreviation
|
494 |
-
if abbrev.endswith('s') and abbrev[:-1].isupper():
|
495 |
-
singular = abbrev[:-1]
|
496 |
-
expanded = " ".join(list(singular)) + "s" # Append 's' to the expanded form
|
497 |
-
# Handle specific plural forms
|
498 |
-
specific_plural = {
|
499 |
-
"MPs": "M Peas",
|
500 |
-
"TMTs": "T M Tees",
|
501 |
-
"ARJs": "A R Jays",
|
502 |
-
# Add more as needed
|
503 |
-
}
|
504 |
-
return specific_plural.get(abbrev, expanded)
|
505 |
-
else:
|
506 |
-
return " ".join(list(abbrev))
|
507 |
-
|
508 |
-
# Regex to match abbreviations (all uppercase letters, possibly ending with 's')
|
509 |
-
text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
|
510 |
|
511 |
-
#
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
# text = re.sub(r"\?(\s|$)", r"?...\1", text)
|
516 |
|
517 |
-
#
|
518 |
if speaker != "Jane":
|
519 |
def insert_thinking_pause(m):
|
520 |
word = m.group(1)
|
521 |
-
|
522 |
-
if random.random() < 0.3: # 30% chance
|
523 |
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
524 |
return f"{word}..., {filler}"
|
525 |
else:
|
526 |
return f"{word}...,"
|
527 |
-
|
528 |
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
529 |
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
530 |
|
531 |
-
|
532 |
-
|
533 |
-
conjunctions_pattern = r"\b(and|but|so|because|however)\b"
|
534 |
-
text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
535 |
|
536 |
-
#
|
537 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
538 |
|
539 |
-
#
|
540 |
-
def capitalize_match(
|
541 |
-
return
|
542 |
-
|
543 |
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
|
544 |
|
545 |
return text.strip()
|
546 |
|
547 |
def _spell_digits(d: str) -> str:
|
548 |
"""
|
549 |
-
Convert digits '3' -> 'three'
|
550 |
"""
|
551 |
digit_map = {
|
552 |
'0': 'zero',
|
@@ -562,23 +566,25 @@ def _spell_digits(d: str) -> str:
|
|
562 |
}
|
563 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
564 |
|
565 |
-
def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
|
566 |
"""
|
567 |
-
Mixes 'spoken' with bg_music.mp3
|
568 |
1) Start with 2 seconds of music alone before speech begins.
|
569 |
2) Loop the music if it's shorter than the final audio length.
|
570 |
-
3) Lower
|
571 |
"""
|
572 |
-
|
|
|
|
|
|
|
573 |
|
574 |
try:
|
575 |
-
bg_music = AudioSegment.from_file(
|
576 |
except Exception as e:
|
577 |
print("[ERROR] Failed to load background music:", e)
|
578 |
return spoken
|
579 |
|
580 |
-
|
581 |
-
bg_music = bg_music - 18.0 # Lower volume (e.g. -18 dB)
|
582 |
|
583 |
total_length_ms = len(spoken) + 2000
|
584 |
looped_music = AudioSegment.empty()
|
@@ -586,8 +592,28 @@ def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
|
|
586 |
looped_music += bg_music
|
587 |
|
588 |
looped_music = looped_music[:total_length_ms]
|
589 |
-
|
590 |
-
# Overlay spoken at 2000ms so we get 2s of music first
|
591 |
final_mix = looped_music.overlay(spoken, position=2000)
|
592 |
-
|
593 |
return final_mix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
import numpy as np
|
17 |
import torch
|
18 |
import random
|
|
|
19 |
|
20 |
class DialogueItem(BaseModel):
|
21 |
speaker: Literal["Jane", "John"] # TTS voice
|
|
|
265 |
print(f"[ERROR] Error fetching article text: {e}")
|
266 |
return ""
|
267 |
|
268 |
+
def generate_script(
|
269 |
+
system_prompt: str,
|
270 |
+
input_text: str,
|
271 |
+
tone: str,
|
272 |
+
target_length: str,
|
273 |
+
host_name: str = "Jane",
|
274 |
+
guest_name: str = "John",
|
275 |
+
sponsor_style: str = "Separate Break"
|
276 |
+
):
|
277 |
+
"""
|
278 |
+
Sends the system_prompt plus input_text to the Groq LLM to generate a
|
279 |
+
multi-speaker Dialogue in JSON, returning a Dialogue object.
|
280 |
+
"""
|
281 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
282 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
283 |
|
284 |
+
words_per_minute = 150
|
285 |
+
numeric_minutes = 3
|
286 |
+
match = re.search(r"(\d+)", target_length)
|
287 |
+
if match:
|
288 |
+
numeric_minutes = int(match.group(1))
|
|
|
|
|
|
|
289 |
|
290 |
+
min_words = max(50, numeric_minutes * 100)
|
291 |
+
max_words = numeric_minutes * words_per_minute
|
292 |
+
|
293 |
+
tone_map = {
|
294 |
"Humorous": "funny and exciting, makes people chuckle",
|
295 |
"Formal": "business-like, well-structured, professional",
|
296 |
"Casual": "like a conversation between close friends, relaxed and informal",
|
297 |
"Youthful": "like how teenagers might chat, energetic and lively"
|
298 |
}
|
299 |
+
chosen_tone = tone_map.get(tone, "casual")
|
300 |
+
|
301 |
+
if sponsor_style == "Separate Break":
|
302 |
+
sponsor_instructions = (
|
303 |
+
"If sponsor content is provided, include it in a separate ad break (~30 seconds). "
|
304 |
+
"Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
|
305 |
+
)
|
306 |
+
else:
|
307 |
+
sponsor_instructions = (
|
308 |
+
"If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
|
309 |
+
"Avoid abrupt transitions."
|
310 |
+
)
|
311 |
|
|
|
312 |
prompt = (
|
313 |
f"{system_prompt}\n"
|
314 |
f"TONE: {chosen_tone}\n"
|
315 |
+
f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
|
316 |
f"INPUT TEXT: {input_text}\n\n"
|
317 |
+
f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
|
318 |
"Please provide the output in the following JSON format without any additional text:\n\n"
|
319 |
"{\n"
|
320 |
' "dialogue": [\n'
|
|
|
344 |
raise ValueError(f"Error communicating with Groq API: {str(e)}")
|
345 |
|
346 |
raw_content = response.choices[0].message.content.strip()
|
|
|
347 |
start_index = raw_content.find('{')
|
348 |
end_index = raw_content.rfind('}')
|
349 |
if start_index == -1 or end_index == -1:
|
350 |
raise ValueError("Failed to parse dialogue: No JSON found.")
|
351 |
|
352 |
json_str = raw_content[start_index:end_index+1].strip()
|
353 |
+
|
354 |
try:
|
355 |
data = json.loads(json_str)
|
356 |
+
dialogue_list = data.get("dialogue", [])
|
357 |
+
|
358 |
+
for d in dialogue_list:
|
359 |
+
raw_speaker = d.get("speaker", "Jane")
|
360 |
+
if raw_speaker.lower() == host_name.lower():
|
361 |
+
d["speaker"] = "Jane"
|
362 |
+
d["display_speaker"] = host_name
|
363 |
+
elif raw_speaker.lower() == guest_name.lower():
|
364 |
+
d["speaker"] = "John"
|
365 |
+
d["display_speaker"] = guest_name
|
366 |
+
else:
|
367 |
+
d["speaker"] = "Jane"
|
368 |
+
d["display_speaker"] = raw_speaker
|
369 |
+
|
370 |
+
new_dialogue_items = []
|
371 |
+
for d in dialogue_list:
|
372 |
+
if "display_speaker" not in d:
|
373 |
+
d["display_speaker"] = d["speaker"]
|
374 |
+
new_dialogue_items.append(DialogueItem(**d))
|
375 |
+
|
376 |
+
return Dialogue(dialogue=new_dialogue_items)
|
377 |
+
except json.JSONDecodeError as e:
|
378 |
+
print("[ERROR] JSON decoding (format) failed:", e)
|
379 |
+
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
380 |
except Exception as e:
|
381 |
print("[ERROR] JSON decoding failed:", e)
|
382 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
383 |
|
384 |
+
|
|
|
|
|
385 |
def transcribe_youtube_video(video_url: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
386 |
print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
|
|
|
387 |
video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
|
388 |
if not video_id_match:
|
389 |
raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
|
|
|
404 |
try:
|
405 |
response = requests.get(base_url, headers=headers, params=params, timeout=30)
|
406 |
print("[LOG] RapidAPI Response Status Code:", response.status_code)
|
407 |
+
print("[LOG] RapidAPI Response Body:", response.text)
|
408 |
|
409 |
if response.status_code != 200:
|
410 |
raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
|
|
|
413 |
if not isinstance(data, list) or not data:
|
414 |
raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
|
415 |
|
|
|
416 |
transcript_as_text = data[0].get('transcriptionAsText', '').strip()
|
417 |
if not transcript_as_text:
|
418 |
raise ValueError("transcriptionAsText field is missing or empty.")
|
419 |
|
420 |
print("[LOG] Transcript retrieval successful.")
|
421 |
print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
|
422 |
+
snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
|
|
|
|
|
|
|
|
|
|
|
423 |
print(f"[DEBUG] Transcript Snippet: {snippet}")
|
424 |
|
425 |
return transcript_as_text
|
|
|
431 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
432 |
"""
|
433 |
Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
|
434 |
+
We also do some pre-processing for punctuation, abbreviations, numeric expansions,
|
435 |
+
plus emotive expressions (ha, sigh, etc.).
|
436 |
"""
|
437 |
try:
|
438 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
|
|
|
|
439 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
440 |
|
441 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
442 |
params = {
|
443 |
+
"model": "aura-asteria-en", # female by default
|
444 |
}
|
445 |
if speaker == "John":
|
446 |
params["model"] = "aura-zeus-en"
|
|
|
484 |
raise ValueError(f"Error generating audio: {str(e)}")
|
485 |
|
486 |
def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
    # Deprecated stub: the original yt-dlp-based local transcription approach.
    # Superseded by transcribe_youtube_video(), which calls the RapidAPI
    # 'youtube-transcriptor' endpoint; kept only so old references don't break.
    pass
|
488 |
|
|
|
|
|
|
|
489 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
490 |
"""
|
491 |
+
1) "SaaS" => "sass"
|
492 |
+
2) Insert periods for uppercase abbreviations -> remove for TTS
|
493 |
+
3) Convert decimals like "3.14" -> "three point one four"
|
494 |
+
4) Convert pure integer numbers like "20" -> "twenty"
|
495 |
+
5) Expand leftover all-caps
|
496 |
+
6) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
|
497 |
+
7) If speaker != Jane, insert filler words
|
498 |
+
8) Remove random fillers
|
499 |
+
9) Capitalize sentence starts
|
500 |
+
"""
|
501 |
+
# 1) "SaaS" => "sass"
|
502 |
+
text = re.sub(r"\b(?i)SaaS\b", "sass", text)
|
503 |
+
|
504 |
+
# 2) Insert periods in uppercase abbreviations (>=2 chars), then remove them
|
505 |
+
def insert_periods_for_abbrev(m):
|
506 |
+
abbr = m.group(0)
|
507 |
+
parted = ".".join(list(abbr)) + "."
|
508 |
+
return parted
|
509 |
+
text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
|
510 |
+
text = re.sub(r"\.\.", ".", text)
|
511 |
+
def remove_periods_for_tts(m):
|
512 |
+
chunk = m.group(0)
|
513 |
+
return chunk.replace(".", " ").strip()
|
514 |
+
text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
|
515 |
+
|
516 |
+
# 3) Hyphens -> spaces
|
517 |
text = re.sub(r"-", " ", text)
|
518 |
|
519 |
+
# Removed numeric conversions to let TTS handle numbers naturally.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
|
521 |
+
# 6) Emotive placeholders
|
522 |
+
text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
|
523 |
+
text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
|
524 |
+
text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
|
|
|
525 |
|
526 |
+
# 7) Insert filler words if speaker != "Jane"
|
527 |
if speaker != "Jane":
|
528 |
def insert_thinking_pause(m):
|
529 |
word = m.group(1)
|
530 |
+
if random.random() < 0.3:
|
|
|
531 |
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
532 |
return f"{word}..., {filler}"
|
533 |
else:
|
534 |
return f"{word}...,"
|
|
|
535 |
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
536 |
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
537 |
|
538 |
+
conj_pattern = r"\b(and|but|so|because|however)\b"
|
539 |
+
text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
|
|
|
|
540 |
|
541 |
+
# 8) Remove random fillers
|
542 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
543 |
|
544 |
+
# 9) Capitalize sentence starts
|
545 |
+
def capitalize_match(m):
|
546 |
+
return m.group().upper()
|
|
|
547 |
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
|
548 |
|
549 |
return text.strip()
|
550 |
|
551 |
def _spell_digits(d: str) -> str:
|
552 |
"""
|
553 |
+
Convert individual digits '3' -> 'three'.
|
554 |
"""
|
555 |
digit_map = {
|
556 |
'0': 'zero',
|
|
|
566 |
}
|
567 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
568 |
|
569 |
+
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
|
570 |
"""
|
571 |
+
Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
|
572 |
1) Start with 2 seconds of music alone before speech begins.
|
573 |
2) Loop the music if it's shorter than the final audio length.
|
574 |
+
3) Lower music volume so the speech is clear.
|
575 |
"""
|
576 |
+
if custom_music_path:
|
577 |
+
music_path = custom_music_path
|
578 |
+
else:
|
579 |
+
music_path = "bg_music.mp3"
|
580 |
|
581 |
try:
|
582 |
+
bg_music = AudioSegment.from_file(music_path, format="mp3")
|
583 |
except Exception as e:
|
584 |
print("[ERROR] Failed to load background music:", e)
|
585 |
return spoken
|
586 |
|
587 |
+
bg_music = bg_music - 18.0
|
|
|
588 |
|
589 |
total_length_ms = len(spoken) + 2000
|
590 |
looped_music = AudioSegment.empty()
|
|
|
592 |
looped_music += bg_music
|
593 |
|
594 |
looped_music = looped_music[:total_length_ms]
|
|
|
|
|
595 |
final_mix = looped_music.overlay(spoken, position=2000)
|
|
|
596 |
return final_mix
|
597 |
+
|
598 |
+
# New helper for short Q&A calls.
def call_groq_api_for_qa(system_prompt: str) -> str:
    """
    Minimal LLM call used for short Q&A turns.

    Sends `system_prompt` to the Groq chat-completions endpoint and returns
    the raw response text, which must be a JSON string such as:
        {"speaker": "John", "text": "Short answer here"}
    On any API failure, a canned fallback answer is returned as JSON instead
    of raising.
    """
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        completion = client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=512,
            temperature=0.7
        )
    except Exception as exc:
        print("[ERROR] Groq API error:", exc)
        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
        return json.dumps(fallback)

    return completion.choices[0].message.content.strip()
|