MyPod_10

Running

App Files Community

siddhartharyaai committed on Jan 13

Commit

84a3c5a

verified ·

1 Parent(s): 4df1c08

Update utils.py

Browse files

Files changed (1) hide show

utils.py +58 -16

utils.py CHANGED Viewed

@@ -17,8 +17,12 @@ import numpy as np
 import torch
 import random
 class DialogueItem(BaseModel):
-    speaker: Literal["Jane", "John"]
     text: str
 class Dialogue(BaseModel):
@@ -264,20 +268,28 @@ def fetch_article_text(link: str) -> str:
         print(f"[ERROR] Error fetching article text: {e}")
         return ""
-def generate_script(system_prompt: str, input_text: str, tone: str, target_length: str):
     """
     Sends the system_prompt plus input_text to the Groq LLM to generate a
     multi-speaker Dialogue in JSON. We parse and return it as a Dialogue object.
-    QUICK FIX ADDED:
-      - If the LLM returns speakers other than "Jane" or "John,"
-        we force them to "Jane" to satisfy the Pydantic literal constraint.
     """
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-    # Instead of a fixed mapping, parse the numeric minutes from target_length if possible
-    # E.g. "3 Mins" -> 3 -> approximate word range
     words_per_minute = 150
     numeric_minutes = 3
     match = re.search(r"(\d+)", target_length)
@@ -337,14 +349,38 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
     json_str = raw_content[start_index:end_index+1].strip()
-    # --- QUICK FIX: Post-process to ensure only "Jane"/"John" as speakers ---
     try:
         data = json.loads(json_str)
-        for d in data.get("dialogue", []):
-            if d.get("speaker") not in ["Jane", "John"]:
-                d["speaker"] = "Jane"  # Force to "Jane" or "John" (you could alternate if desired)
-        return Dialogue(**data)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
@@ -353,7 +389,6 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
         print("[ERROR] JSON decoding failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
-# REPLACE the YTDLP-based approach with the RapidAPI approach
 def transcribe_youtube_video(video_url: str) -> str:
     """
     Transcribe the given YouTube video by calling the RapidAPI 'youtube-transcriptor' endpoint.
@@ -425,8 +460,9 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
         # Deepgram TTS endpoint
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
-            "model": "aura-asteria-en",  # default
         }
         if speaker == "John":
             params["model"] = "aura-zeus-en"
@@ -480,6 +516,8 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     Enhances text for natural-sounding TTS by handling abbreviations,
     punctuation, and intelligent filler insertion.
     Adjustments are made based on the speaker to optimize output quality.
     """
     # 1) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
@@ -494,10 +532,14 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     text = re.sub(r"\d+\.\d+", convert_decimal, text)
-    # 3) Abbreviations (e.g., NASA -> N A S A)
     def expand_abbreviations(match):
         abbrev = match.group()
-        # Check if it's plural
         if abbrev.endswith('s') and abbrev[:-1].isupper():
             singular = abbrev[:-1]
             expanded = " ".join(list(singular)) + "s"

 import torch
 import random
+# ---------------------------------------------------------------------
+# Updated: DialogueItem now has an extra field `display_speaker`
+# ---------------------------------------------------------------------
 class DialogueItem(BaseModel):
+    speaker: Literal["Jane", "John"]           # Used internally for TTS voice
+    display_speaker: str = "Jane"             # The name shown in the user-facing transcript
     text: str
 class Dialogue(BaseModel):
         print(f"[ERROR] Error fetching article text: {e}")
         return ""
+# ---------------------------------------------------------------------
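+# Pass host_name & guest_name so the host maps to the female TTS voice ("Jane")
+# and the guest to the male TTS voice ("John"), while display_speaker keeps the
+# name shown to the user separate from the internal speaker value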
+# ---------------------------------------------------------------------
+def generate_script(system_prompt: str, input_text: str, tone: str, target_length: str,
+                   host_name: str = "Jane", guest_name: str = "John"):
     """
     Sends the system_prompt plus input_text to the Groq LLM to generate a
     multi-speaker Dialogue in JSON. We parse and return it as a Dialogue object.
+    Logic:
+    - We parse the LLM's raw speaker name (e.g., "Angela", "Dimitris").
+    - If it matches the host_name, we set speaker="Jane" (female voice),
+      display_speaker = host_name.
+    - If it matches the guest_name, we set speaker="John" (male voice),
+      display_speaker = guest_name.
+    - If we can't match, default to "Jane" for speaker, but keep display_speaker as whatever LLM returned.
     """
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+    # Instead of a fixed mapping, parse numeric minutes from target_length if possible
     words_per_minute = 150
     numeric_minutes = 3
     match = re.search(r"(\d+)", target_length)
     json_str = raw_content[start_index:end_index+1].strip()
     try:
         data = json.loads(json_str)
+        dialogue_list = data.get("dialogue", [])
+        # Post-process to ensure correct TTS speaker + custom display name
+        for d in dialogue_list:
+            raw_speaker = d.get("speaker", "Jane")
+            text_line = d.get("text", "")
+            # If raw_speaker matches host_name (case-insensitive), speaker = "Jane"
+            if raw_speaker.lower() == host_name.lower():
+                d["speaker"] = "Jane"
+                d["display_speaker"] = host_name
+            # If raw_speaker matches guest_name, speaker = "John"
+            elif raw_speaker.lower() == guest_name.lower():
+                d["speaker"] = "John"
+                d["display_speaker"] = guest_name
+            else:
+                # Otherwise default: we assume it's host
+                d["speaker"] = "Jane"
+                d["display_speaker"] = raw_speaker  # keep the original name for display
+        # Now build the Dialogue object
+        # For any item that doesn't have display_speaker, fall back to its TTS speaker value
+        new_dialogue_items = []
+        for d in dialogue_list:
+            if "display_speaker" not in d:
+                d["display_speaker"] = d["speaker"]  # fallback
+            # Convert dict -> DialogueItem
+            new_dialogue_items.append(DialogueItem(**d))
+        return Dialogue(dialogue=new_dialogue_items)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
         print("[ERROR] JSON decoding failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
 def transcribe_youtube_video(video_url: str) -> str:
     """
     Transcribe the given YouTube video by calling the RapidAPI 'youtube-transcriptor' endpoint.
         # Deepgram TTS endpoint
         deepgram_api_url = "https://api.deepgram.com/v1/speak"
         params = {
+            "model": "aura-asteria-en",  # default female
         }
+        # If speaker == "John", use male voice
         if speaker == "John":
             params["model"] = "aura-zeus-en"
     Enhances text for natural-sounding TTS by handling abbreviations,
     punctuation, and intelligent filler insertion.
     Adjustments are made based on the speaker to optimize output quality.
+    New: We'll handle "SaaS" so that it is read as "S A A S".
     """
     # 1) Hyphens -> spaces
     text = re.sub(r"-", " ", text)
     text = re.sub(r"\d+\.\d+", convert_decimal, text)
+    # 3) Abbreviations (e.g., NASA -> N A S A).
+    #    We'll also handle "SaaS" -> "S A A S" specifically.
     def expand_abbreviations(match):
         abbrev = match.group()
+        # Special handling for "SaaS" -> "S A A S"
+        if abbrev.lower() == "saas":
+            return "S A A S"
+        # Check if it's plural with capital letters
         if abbrev.endswith('s') and abbrev[:-1].isupper():
             singular = abbrev[:-1]
             expanded = " ".join(list(singular)) + "s"