MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 14

Commit

2bcba5d

verified ·

1 Parent(s): 5198e6d

Update utils.py

Browse files

Files changed (1) hide show

utils.py +51 -101

utils.py CHANGED Viewed

@@ -18,14 +18,14 @@ import torch
 import random
 class DialogueItem(BaseModel):
-    speaker: Literal["Jane", "John"]   # TTS voice
-    display_speaker: str = "Jane"      # For display in transcript
     text: str
 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]
-# Initialize Whisper (unused for YouTube with RapidAPI)
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny.en",
@@ -33,10 +33,6 @@ asr_pipeline = pipeline(
 )
 def truncate_text(text, max_tokens=2048):
-    """
-    If the text exceeds the max token limit (approx. 2,048), truncate it
-    to avoid exceeding the model's context window.
-    """
     print("[LOG] Truncating text if needed.")
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text)
@@ -46,10 +42,6 @@ def truncate_text(text, max_tokens=2048):
     return text
 def extract_text_from_url(url):
-    """
-    Fetches and extracts readable text from a given URL
-    (stripping out scripts, styles, etc.).
-    """
     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {
@@ -74,29 +66,17 @@ def extract_text_from_url(url):
         return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
-    """
-    Shifts the pitch of an AudioSegment by a given number of semitones.
-    Positive semitones shift the pitch up, negative shifts it down.
-    """
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
 def is_sufficient(text: str, min_word_count: int = 500) -> bool:
-    """
-    Checks if the fetched text meets our sufficiency criteria
-    (e.g., at least 500 words).
-    """
     word_count = len(text.split())
     print(f"[DEBUG] Aggregated word count: {word_count}")
     return word_count >= min_word_count
 def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
-    """
-    Queries the Groq API to retrieve more info from the LLM's knowledge base.
-    Appends it to our aggregated info if found.
-    """
     print("[LOG] Querying LLM for additional information.")
     system_prompt = (
         "You are an AI assistant with extensive knowledge up to 2023-10. "
@@ -122,10 +102,6 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
     return additional_info
 def research_topic(topic: str) -> str:
-    """
-    Gathers info from various RSS feeds and Wikipedia. If needed, queries the LLM
-    for more data if the aggregated text is insufficient.
-    """
     sources = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -144,7 +120,6 @@ def research_topic(topic: str) -> str:
     if wiki_summary:
         summary_parts.append(f"From Wikipedia: {wiki_summary}")
-    # For each RSS feed
     for name, feed_url in sources.items():
         try:
             items = fetch_rss_feed(feed_url)
@@ -165,7 +140,6 @@ def research_topic(topic: str) -> str:
     print("[DEBUG] Aggregated info from primary sources:")
     print(aggregated_info)
-    # Fallback to LLM if insufficient
     if not is_sufficient(aggregated_info):
         print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
         additional_info = query_llm_for_additional_info(topic, aggregated_info)
@@ -180,9 +154,6 @@ def research_topic(topic: str) -> str:
     return aggregated_info
 def fetch_wikipedia_summary(topic: str) -> str:
-    """
-    Fetch a quick Wikipedia summary of the topic via the official Wikipedia API.
-    """
     print("[LOG] Fetching Wikipedia summary for:", topic)
     try:
         search_url = (
@@ -209,9 +180,6 @@ def fetch_wikipedia_summary(topic: str) -> str:
         return ""
 def fetch_rss_feed(feed_url: str) -> list:
-    """
-    Pulls RSS feed data from a given URL and returns items.
-    """
     print("[LOG] Fetching RSS feed:", feed_url)
     try:
         resp = requests.get(feed_url)
@@ -226,10 +194,6 @@ def fetch_rss_feed(feed_url: str) -> list:
         return []
 def find_relevant_article(items, topic: str, min_match=2) -> tuple:
-    """
-    Check each article in the RSS feed for mention of the topic
-    by counting the number of keyword matches.
-    """
     print("[LOG] Finding relevant articles...")
     keywords = re.findall(r'\w+', topic.lower())
     for item in items:
@@ -244,9 +208,6 @@ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
     return None, None, None
 def fetch_article_text(link: str) -> str:
-    """
-    Fetch the article text from the given link (first 5 paragraphs).
-    """
     print("[LOG] Fetching article text from:", link)
     if not link:
         print("[LOG] No link provided for article text.")
@@ -275,8 +236,8 @@ def generate_script(
     sponsor_style: str = "Separate Break"
 ):
     """
-    Sends the system_prompt plus input_text to the Groq LLM to generate a
-    multi-speaker Dialogue in JSON, returning a Dialogue object.
     """
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
@@ -423,7 +384,6 @@ def transcribe_youtube_video(video_url: str) -> str:
         print(f"[DEBUG] Transcript Snippet: {snippet}")
         return transcript_as_text
     except Exception as e:
         print("[ERROR] RapidAPI transcription error:", e)
         raise ValueError(f"Error transcribing YouTube video via RapidAPI: {str(e)}")
@@ -431,8 +391,7 @@ def transcribe_youtube_video(video_url: str) -> str:
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
-    We also do some pre-processing for punctuation, abbreviations, numeric expansions,
-    plus emotive expressions (ha, sigh, etc.).
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
@@ -443,7 +402,7 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
             "model": "aura-asteria-en",  # female by default
         }
         if speaker == "John":
-            params["model"] = "aura-helios-en"
         headers = {
             "Accept": "audio/mpeg",
@@ -468,7 +427,6 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
                     mp3_file.write(chunk)
             mp3_path = mp3_file.name
-        # Normalize volume
         audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
         audio_seg = effects.normalize(audio_seg)
@@ -489,26 +447,25 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
 def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     """
     1) "SaaS" => "sass"
-    2) Insert periods in uppercase abbreviations -> remove for TTS
-    3) Convert decimals like "3.14" -> "three point one four"
-    4) Convert pure integer numbers like "20" -> "twenty"
     5) Expand leftover all-caps
     6) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
-    7) If speaker == "John", we insert short breath pauses only after punctuation
-    8) Remove random fillers
     9) Capitalize sentence starts
     """
     # 1) "SaaS" => "sass"
     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
-    # 2) Insert periods in uppercase abbreviations, then remove them
     def insert_periods_for_abbrev(m):
         abbr = m.group(0)
         parted = ".".join(list(abbr)) + "."
         return parted
     text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
     text = re.sub(r"\.\.", ".", text)
     def remove_periods_for_tts(m):
         # "N.I.A." => "N I A"
         chunk = m.group(0)
@@ -527,13 +484,13 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
         return f"{whole_part} point {decimal_part}"
     text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
-    # 5) Convert pure integer => words
     def convert_int_to_words(m):
         num_str = m.group()
         return number_to_words(int(num_str))
     text = re.sub(r"\b\d+\b", convert_int_to_words, text)
-    # 6) Expand leftover all-caps => "NASA" => "N A S A"
     def expand_abbreviations(m):
         abbrev = m.group()
         if abbrev.endswith('s') and abbrev[:-1].isupper():
@@ -549,11 +506,15 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
             return " ".join(list(abbrev))
     text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
-    # 7) If speaker == "John", insert short breath pauses after punctuation
     if speaker == "John":
-        # Insert a short "..." after punctuation marks
-        text = re.sub(r"([.,!?;:])", r"\1...", text)
-        # Optionally remove random in-word pausing logic if you had it
     # 8) Remove random fillers
     text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
@@ -585,67 +546,58 @@ def _spell_digits(d: str) -> str:
 def number_to_words(n: int) -> str:
     """
-    Enhanced integer-to-words up to ~999,999 or more.
-    E.g., 10 -> 'ten', 4000 -> 'four thousand', 999999 -> 'nine hundred ninety nine thousand nine hundred ninety nine'
     """
     if n == 0:
         return "zero"
     if n < 0:
         return "minus " + number_to_words(-n)
-    # Up to 999,999 or so. Extend if you need more.
     ones = ["","one","two","three","four","five","six","seven","eight","nine"]
     teens = ["ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"]
     tens_words = ["","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
     def three_digits(x):
-        """ Convert 0 <= x < 1000 to words """
-        words = []
         hundreds = x // 100
-        remainder = x % 100
         if hundreds > 0:
-            words.append(ones[hundreds])
-            words.append("hundred")
-            if remainder > 0:
-                words.append("and")
-        if remainder < 10 and remainder > 0:
-            words.append(ones[remainder])
-        elif remainder >= 10 and remainder < 20:
-            words.append(teens[remainder-10])
         else:
-            t = remainder // 10
-            o = remainder % 10
             if t > 1:
-                words.append(tens_words[t])
             if o > 0:
-                words.append(ones[o])
-        return " ".join(w for w in words if w)
-    # We'll chunk up to 999,999
     thousands = n // 1000
     remainder = n % 1000
-    words_list = []
     if thousands > 0:
-        words_list.append(three_digits(thousands))
-        words_list.append("thousand")
     if remainder > 0:
-        words_list.append(three_digits(remainder))
-    final = " ".join(w for w in words_list if w).strip()
-    return final or "zero"
 def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
     """
-    Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
-    1) Start with 2 seconds of music alone before speech begins.
-    2) Loop music if shorter than the final audio length.
-    3) Lower music volume so the speech is clear.
     """
     if custom_music_path:
         music_path = custom_music_path
@@ -669,12 +621,10 @@ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegm
     final_mix = looped_music.overlay(spoken, position=2000)
     return final_mix
-# This function is new for short Q&A calls
 def call_groq_api_for_qa(system_prompt: str) -> str:
     """
-    A minimal placeholder for your short Q&A LLM call.
-    Must return a JSON string, e.g.:
-    {"speaker": "John", "text": "Short answer here"}
     """
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
     try:

 import random
 # One turn of the generated dialogue: which voice speaks it, the name shown
 # for that turn, and the text itself.
 class DialogueItem(BaseModel):
+    speaker: Literal["Jane", "John"]  # TTS voice selector; generate_audio_mp3 switches the Deepgram model when this is "John"
+    display_speaker: str = "Jane"  # name shown in the transcript; defaults to "Jane"
     text: str  # the utterance text for this turn
 # Container model for a whole script: a list of DialogueItem turns.
 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]
+# Whisper ASR pipeline: not used for YouTube (that path goes through RapidAPI); kept for local transcription if needed
 asr_pipeline = pipeline(
     "automatic-speech-recognition",
     model="openai/whisper-tiny.en",
 )
 def truncate_text(text, max_tokens=2048):
     print("[LOG] Truncating text if needed.")
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text)
     return text
 def extract_text_from_url(url):
     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {
         return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     """
     Return a copy of `audio` with its pitch moved by `semitones`
     (positive raises the pitch, negative lowers it).

     The raw samples are re-labelled with a frame rate scaled by
     2 ** (semitones / 12), and the result is then set back to the
     original frame rate.
     """
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     original_rate = audio.frame_rate
     scale = 2.0 ** (semitones / 12.0)
     retagged = audio._spawn(
         audio.raw_data,
         overrides={'frame_rate': int(original_rate * scale)},
     )
     return retagged.set_frame_rate(original_rate)
 def is_sufficient(text: str, min_word_count: int = 500) -> bool:
     """
     Report whether `text` contains at least `min_word_count`
     whitespace-separated words. The count is printed as a debug line.
     """
     words = text.split()
     total = len(words)
     print(f"[DEBUG] Aggregated word count: {total}")
     return not total < min_word_count
 def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
     print("[LOG] Querying LLM for additional information.")
     system_prompt = (
         "You are an AI assistant with extensive knowledge up to 2023-10. "
     return additional_info
 def research_topic(topic: str) -> str:
     sources = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
     if wiki_summary:
         summary_parts.append(f"From Wikipedia: {wiki_summary}")
     for name, feed_url in sources.items():
         try:
             items = fetch_rss_feed(feed_url)
     print("[DEBUG] Aggregated info from primary sources:")
     print(aggregated_info)
     if not is_sufficient(aggregated_info):
         print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
         additional_info = query_llm_for_additional_info(topic, aggregated_info)
     return aggregated_info
 def fetch_wikipedia_summary(topic: str) -> str:
     print("[LOG] Fetching Wikipedia summary for:", topic)
     try:
         search_url = (
         return ""
 def fetch_rss_feed(feed_url: str) -> list:
     print("[LOG] Fetching RSS feed:", feed_url)
     try:
         resp = requests.get(feed_url)
         return []
 def find_relevant_article(items, topic: str, min_match=2) -> tuple:
     print("[LOG] Finding relevant articles...")
     keywords = re.findall(r'\w+', topic.lower())
     for item in items:
     return None, None, None
 def fetch_article_text(link: str) -> str:
     print("[LOG] Fetching article text from:", link)
     if not link:
         print("[LOG] No link provided for article text.")
     sponsor_style: str = "Separate Break"
 ):
     """
+    If sponsor content is empty, we won't have sponsor instructions appended in app.py's prompt.
+    So the LLM should not generate sponsor segments.
     """
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         print(f"[DEBUG] Transcript Snippet: {snippet}")
         return transcript_as_text
     except Exception as e:
         print("[ERROR] RapidAPI transcription error:", e)
         raise ValueError(f"Error transcribing YouTube video via RapidAPI: {str(e)}")
 def generate_audio_mp3(text: str, speaker: str) -> str:
     """
     Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
+    Then we do normal volume normalization, etc.
     """
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
             "model": "aura-asteria-en",  # female by default
         }
         if speaker == "John":
+            params["model"] = "aura-zeus-en"
         headers = {
             "Accept": "audio/mpeg",
                     mp3_file.write(chunk)
             mp3_path = mp3_file.name
         audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
         audio_seg = effects.normalize(audio_seg)
 def _preprocess_text_for_tts(text: str, speaker: str) -> str:
     """
     1) "SaaS" => "sass"
+    2) Insert periods for uppercase abbreviations -> remove for TTS (N.I.A. => N I A)
+    3) Convert decimals (3.14 => 'three point one four')
+    4) Convert integers (10 => 'ten', 4000 => 'four thousand')
     5) Expand leftover all-caps
     6) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
+    7) If speaker == "John", insert short breath "..." after punctuation (not random mid-word)
+    8) Remove random fillers (uh, um)
     9) Capitalize sentence starts
     """
     # 1) "SaaS" => "sass"
     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
+    # 2) Insert periods for uppercase abbreviations => remove them
     def insert_periods_for_abbrev(m):
         abbr = m.group(0)
         parted = ".".join(list(abbr)) + "."
         return parted
     text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
     text = re.sub(r"\.\.", ".", text)
     def remove_periods_for_tts(m):
         # "N.I.A." => "N I A"
         chunk = m.group(0)
         return f"{whole_part} point {decimal_part}"
     text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
+    # Convert pure integers => words
     def convert_int_to_words(m):
         num_str = m.group()
         return number_to_words(int(num_str))
     text = re.sub(r"\b\d+\b", convert_int_to_words, text)
+    # 5) Expand leftover all-caps => "NASA" => "N A S A"
     def expand_abbreviations(m):
         abbrev = m.group()
         if abbrev.endswith('s') and abbrev[:-1].isupper():
             return " ".join(list(abbrev))
     text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
+    # 6) Emotive placeholders
+    text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
+    text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
+    # 7) If speaker == "John", place short "..." after punctuation only
     if speaker == "John":
+        # Insert a short "..." after . , ! ? ; :
+        text = re.sub(r"([.,!?;:])(\s|$)", r"\1...\2", text)
     # 8) Remove random fillers
     text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
 def number_to_words(n: int) -> str:
     """
     Spell out an integer in English words for TTS,
     e.g. 10 => 'ten', 4000 => 'four thousand',
     105 => 'one hundred and five', -21 => 'minus twenty one'.

     The number is split into groups of three digits, each followed by its
     scale word (thousand, million, ... up to quintillion). Values too large
     for the scale table are spelled digit by digit instead of raising, since
     this function is applied to every integer found in arbitrary input text.

     Args:
         n: The integer to convert; may be negative.

     Returns:
         The space-separated English words for `n`.
     """
     if n == 0:
         return "zero"
     if n < 0:
         return "minus " + number_to_words(-n)

     ones = ["","one","two","three","four","five","six","seven","eight","nine"]
     teens = ["ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"]
     tens_words = ["","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
     # Index i is the scale word for the i-th group of three digits (from the right).
     scales = ["", "thousand", "million", "billion", "trillion", "quadrillion", "quintillion"]

     def three_digits(x):
         # Convert 0 <= x < 1000 to words; returns "" for 0.
         w = []
         hundreds, rem = divmod(x, 100)
         if hundreds > 0:
             w.append(ones[hundreds])
             w.append("hundred")
             if rem > 0:
                 w.append("and")
         if 0 < rem < 10:
             w.append(ones[rem])
         elif 10 <= rem < 20:
             w.append(teens[rem - 10])
         else:
             t, o = divmod(rem, 10)
             if t > 1:
                 w.append(tens_words[t])
             if o > 0:
                 w.append(ones[o])
         return " ".join(w)

     # Split into base-1000 groups, least significant first.
     groups = []
     remaining = n
     while remaining > 0:
         remaining, group = divmod(remaining, 1000)
         groups.append(group)

     if len(groups) > len(scales):
         # Beyond the scale table: fall back to reading the digits one by one.
         digit_names = ["zero"] + ones[1:]
         return " ".join(digit_names[int(d)] for d in str(n))

     parts = []
     for idx in range(len(groups) - 1, -1, -1):
         if groups[idx] == 0:
             continue  # skip empty groups so 1,000,005 has no stray "thousand"
         parts.append(three_digits(groups[idx]))
         if scales[idx]:
             parts.append(scales[idx])
     return " ".join(parts)
 def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
     """
+    Mixes 'spoken' with bg_music.mp3 or custom music:
+    - 2s lead-in
+    - Loop if shorter
+    - Lower volume
     """
     if custom_music_path:
         music_path = custom_music_path
     final_mix = looped_music.overlay(spoken, position=2000)
     return final_mix
 def call_groq_api_for_qa(system_prompt: str) -> str:
     """
+    Minimal function for short Q&A calls. Must return JSON:
+    { "speaker": "John", "text": "Short answer" }
     """
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
     try: