siddhartharyaai committed on
Commit
aacfe72
·
verified ·
1 Parent(s): e4e7996

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +89 -35
utils.py CHANGED
@@ -33,6 +33,10 @@ asr_pipeline = pipeline(
33
  )
34
 
35
  def truncate_text(text, max_tokens=2048):
 
 
 
 
36
  print("[LOG] Truncating text if needed.")
37
  tokenizer = tiktoken.get_encoding("cl100k_base")
38
  tokens = tokenizer.encode(text)
@@ -42,6 +46,10 @@ def truncate_text(text, max_tokens=2048):
42
  return text
43
 
44
  def extract_text_from_url(url):
 
 
 
 
45
  print("[LOG] Extracting text from URL:", url)
46
  try:
47
  headers = {
@@ -66,17 +74,29 @@ def extract_text_from_url(url):
66
  return ""
67
 
68
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    """
    Shift the pitch of *audio* by *semitones* via resampling: replay the raw
    frames at a scaled frame rate, then restore the original frame rate.
    """
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    rate_factor = 2.0 ** (semitones / 12.0)
    resampled = audio._spawn(
        audio.raw_data,
        overrides={'frame_rate': int(audio.frame_rate * rate_factor)},
    )
    return resampled.set_frame_rate(audio.frame_rate)
73
 
74
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    """Return True when *text* has at least *min_word_count* whitespace-separated words."""
    n_words = len(text.split())
    print(f"[DEBUG] Aggregated word count: {n_words}")
    return n_words >= min_word_count
78
 
79
  def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
 
 
 
 
80
  print("[LOG] Querying LLM for additional information.")
81
  system_prompt = (
82
  "You are an AI assistant with extensive knowledge up to 2023-10. "
@@ -102,6 +122,10 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
102
  return additional_info
103
 
104
  def research_topic(topic: str) -> str:
 
 
 
 
105
  sources = {
106
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
107
  "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -112,12 +136,15 @@ def research_topic(topic: str) -> str:
112
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
113
  "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
114
  }
 
115
  summary_parts = []
116
 
 
117
  wiki_summary = fetch_wikipedia_summary(topic)
118
  if wiki_summary:
119
  summary_parts.append(f"From Wikipedia: {wiki_summary}")
120
 
 
121
  for name, feed_url in sources.items():
122
  try:
123
  items = fetch_rss_feed(feed_url)
@@ -138,6 +165,7 @@ def research_topic(topic: str) -> str:
138
  print("[DEBUG] Aggregated info from primary sources:")
139
  print(aggregated_info)
140
 
 
141
  if not is_sufficient(aggregated_info):
142
  print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
143
  additional_info = query_llm_for_additional_info(topic, aggregated_info)
@@ -152,6 +180,9 @@ def research_topic(topic: str) -> str:
152
  return aggregated_info
153
 
154
  def fetch_wikipedia_summary(topic: str) -> str:
 
 
 
155
  print("[LOG] Fetching Wikipedia summary for:", topic)
156
  try:
157
  search_url = (
@@ -178,6 +209,9 @@ def fetch_wikipedia_summary(topic: str) -> str:
178
  return ""
179
 
180
  def fetch_rss_feed(feed_url: str) -> list:
 
 
 
181
  print("[LOG] Fetching RSS feed:", feed_url)
182
  try:
183
  resp = requests.get(feed_url)
@@ -192,6 +226,10 @@ def fetch_rss_feed(feed_url: str) -> list:
192
  return []
193
 
194
  def find_relevant_article(items, topic: str, min_match=2) -> tuple:
 
 
 
 
195
  print("[LOG] Finding relevant articles...")
196
  keywords = re.findall(r'\w+', topic.lower())
197
  for item in items:
@@ -206,6 +244,9 @@ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
206
  return None, None, None
207
 
208
  def fetch_article_text(link: str) -> str:
 
 
 
209
  print("[LOG] Fetching article text from:", link)
210
  if not link:
211
  print("[LOG] No link provided for article text.")
@@ -233,6 +274,13 @@ def generate_script(
233
  guest_name: str = "John",
234
  sponsor_style: str = "Separate Break"
235
  ):
 
 
 
 
 
 
 
236
  print("[LOG] Generating script with tone:", tone, "and length:", target_length)
237
  groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
238
 
@@ -387,7 +435,8 @@ def transcribe_youtube_video(video_url: str) -> str:
387
  def generate_audio_mp3(text: str, speaker: str) -> str:
388
  """
389
  Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
390
- We also do some pre-processing for punctuation, abbreviations, numeric expansions, etc.
 
391
  """
392
  try:
393
  print(f"[LOG] Generating audio for speaker: {speaker}")
@@ -444,26 +493,25 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
444
  def _preprocess_text_for_tts(text: str, speaker: str) -> str:
445
  """
446
  1) "SaaS" => "sass"
447
- 2) Insert periods for uppercase abbreviations => remove them for TTS
448
- 3) Convert decimals "3.14" => "three point one four"
449
- 4) For pure integer numbers (e.g. "10", "2023") => "ten", "two thousand twenty three"
450
  5) Expand leftover all-caps
451
- 6) Insert fillers if speaker != "Jane"
452
- 7) Remove random fillers
453
- 8) Capitalize sentence starts
 
454
  """
455
  # 1) "SaaS" => "sass"
456
  text = re.sub(r"\b(?i)SaaS\b", "sass", text)
457
 
458
- # 2) Insert periods for uppercase abbreviations of length >=2 => e.g. "CIA" -> "C.I.A."
459
  def insert_periods_for_abbrev(m):
460
  abbr = m.group(0)
461
  parted = ".".join(list(abbr)) + "."
462
  return parted
463
  text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
464
- text = re.sub(r"\.\.", ".", text) # remove double-dots
465
-
466
- # 2b) Then remove those periods => TTS won't say "dot"
467
  def remove_periods_for_tts(m):
468
  chunk = m.group(0)
469
  return chunk.replace(".", " ").strip()
@@ -472,7 +520,7 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
472
  # 3) Hyphens -> spaces
473
  text = re.sub(r"-", " ", text)
474
 
475
- # 4) Convert decimals like "3.14" => "three point one four"
476
  def convert_decimal(m):
477
  number_str = m.group()
478
  parts = number_str.split('.')
@@ -481,22 +529,16 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
481
  return f"{whole_part} point {decimal_part}"
482
  text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
483
 
484
- # 5) Convert pure integer numbers => e.g. "10" -> "ten", "42" -> "forty two"
485
- # We'll do a quick function for small-ish integers (up to 9999 for demo).
486
  def convert_int_to_words(m):
487
  num_str = m.group()
488
- # e.g. "10" => 10 => "ten"
489
- # "2023" => "two thousand twenty three"
490
- # For brevity, handle up to 99999 or so. Or you can import "num2words" for a robust approach.
491
  return number_to_words(int(num_str))
492
-
493
  text = re.sub(r"\b\d+\b", convert_int_to_words, text)
494
 
495
- # 6) Expand leftover all-caps abbreviations => "NASA" => "N A S A"
496
  def expand_abbreviations(m):
497
  abbrev = m.group()
498
  if abbrev.endswith('s') and abbrev[:-1].isupper():
499
- # Plural e.g. "MPs" => "M Peas"
500
  singular = abbrev[:-1]
501
  expanded = " ".join(list(singular)) + "s"
502
  special_plurals = {
@@ -509,7 +551,15 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
509
  return " ".join(list(abbrev))
510
  text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
511
 
512
- # 7) If speaker != Jane, insert filler words around certain keywords
 
 
 
 
 
 
 
 
513
  if speaker != "Jane":
514
  def insert_thinking_pause(m):
515
  word = m.group(1)
@@ -521,14 +571,13 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
521
  keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
522
  text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
523
 
524
- # Insert dynamic pauses for certain conjunctions
525
  conj_pattern = r"\b(and|but|so|because|however)\b"
526
  text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
527
 
528
- # 8) Remove random fillers
529
  text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
530
 
531
- # 9) Capitalize sentence starts
532
  def capitalize_match(m):
533
  return m.group().upper()
534
  text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
@@ -537,8 +586,8 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
537
 
538
  def number_to_words(n: int) -> str:
539
  """
540
- Very simple function to convert integers up to 99999 into words for TTS.
541
- If you want a robust approach, consider the 'num2words' library.
542
  """
543
  if n == 0:
544
  return "zero"
@@ -546,12 +595,14 @@ def number_to_words(n: int) -> str:
546
  if n < 0:
547
  return "minus " + number_to_words(abs(n))
548
 
549
- # Basic chunking
550
  ones = ["","one","two","three","four","five","six","seven","eight","nine"]
551
- teens = ["ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen","nineteen"]
552
- tens_words = ["","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety"]
 
 
553
 
554
  words = []
 
555
  def two_digit_word(x):
556
  if x == 0:
557
  return ""
@@ -559,19 +610,16 @@ def number_to_words(n: int) -> str:
559
  return ones[x]
560
  if 10 <= x < 20:
561
  return teens[x-10]
562
- # 20+
563
- tens_part = x // 10
564
- ones_part = x % 10
565
- return tens_words[tens_part] + (f" {ones[ones_part]}" if ones_part else "")
566
 
567
- # Handle thousands
568
  thousands = n // 1000
569
  remainder = n % 1000
570
  if thousands > 0:
571
  words.append(two_digit_word(thousands))
572
  words.append("thousand")
573
 
574
- # Handle hundreds
575
  hundreds = remainder // 100
576
  last_two = remainder % 100
577
  if hundreds > 0:
@@ -586,6 +634,12 @@ def number_to_words(n: int) -> str:
586
  return " ".join(w for w in words if w).strip()
587
 
588
  def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
 
 
 
 
 
 
589
  if custom_music_path:
590
  music_path = custom_music_path
591
  else:
 
33
  )
34
 
35
  def truncate_text(text, max_tokens=2048):
36
+ """
37
+ If the text exceeds the max token limit (approx. 2,048), truncate it
38
+ to avoid exceeding the model's context window.
39
+ """
40
  print("[LOG] Truncating text if needed.")
41
  tokenizer = tiktoken.get_encoding("cl100k_base")
42
  tokens = tokenizer.encode(text)
 
46
  return text
47
 
48
  def extract_text_from_url(url):
49
+ """
50
+ Fetches and extracts readable text from a given URL
51
+ (stripping out scripts, styles, etc.).
52
+ """
53
  print("[LOG] Extracting text from URL:", url)
54
  try:
55
  headers = {
 
74
  return ""
75
 
76
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    """
    Shifts the pitch of an AudioSegment by a given number of semitones.
    Positive semitones shift the pitch up, negative shifts it down.
    """
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    # Scale the frame rate by 2^(semitones/12), then set it back — the
    # resample trick changes the perceived pitch of the same raw data.
    scaled_rate = int(audio.frame_rate * 2.0 ** (semitones / 12.0))
    shifted = audio._spawn(audio.raw_data, overrides={'frame_rate': scaled_rate})
    return shifted.set_frame_rate(audio.frame_rate)
85
 
86
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    """
    Checks if the fetched text meets our sufficiency criteria
    (e.g., at least 500 words).
    """
    words = text.split()
    print(f"[DEBUG] Aggregated word count: {len(words)}")
    return len(words) >= min_word_count
94
 
95
  def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
96
+ """
97
+ Queries the Groq API to retrieve more info from the LLM's knowledge base.
98
+ Appends it to our aggregated info if found.
99
+ """
100
  print("[LOG] Querying LLM for additional information.")
101
  system_prompt = (
102
  "You are an AI assistant with extensive knowledge up to 2023-10. "
 
122
  return additional_info
123
 
124
  def research_topic(topic: str) -> str:
125
+ """
126
+ Gathers info from various RSS feeds and Wikipedia. If needed, queries the LLM
127
+ for more data if the aggregated text is insufficient.
128
+ """
129
  sources = {
130
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
131
  "CNN": "http://rss.cnn.com/rss/edition.rss",
 
136
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
137
  "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
138
  }
139
+
140
  summary_parts = []
141
 
142
+ # Wikipedia summary
143
  wiki_summary = fetch_wikipedia_summary(topic)
144
  if wiki_summary:
145
  summary_parts.append(f"From Wikipedia: {wiki_summary}")
146
 
147
+ # For each RSS feed
148
  for name, feed_url in sources.items():
149
  try:
150
  items = fetch_rss_feed(feed_url)
 
165
  print("[DEBUG] Aggregated info from primary sources:")
166
  print(aggregated_info)
167
 
168
+ # Fallback to LLM if insufficient
169
  if not is_sufficient(aggregated_info):
170
  print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
171
  additional_info = query_llm_for_additional_info(topic, aggregated_info)
 
180
  return aggregated_info
181
 
182
  def fetch_wikipedia_summary(topic: str) -> str:
183
+ """
184
+ Fetch a quick Wikipedia summary of the topic via the official Wikipedia API.
185
+ """
186
  print("[LOG] Fetching Wikipedia summary for:", topic)
187
  try:
188
  search_url = (
 
209
  return ""
210
 
211
  def fetch_rss_feed(feed_url: str) -> list:
212
+ """
213
+ Pulls RSS feed data from a given URL and returns items.
214
+ """
215
  print("[LOG] Fetching RSS feed:", feed_url)
216
  try:
217
  resp = requests.get(feed_url)
 
226
  return []
227
 
228
  def find_relevant_article(items, topic: str, min_match=2) -> tuple:
229
+ """
230
+ Check each article in the RSS feed for mention of the topic
231
+ by counting the number of keyword matches.
232
+ """
233
  print("[LOG] Finding relevant articles...")
234
  keywords = re.findall(r'\w+', topic.lower())
235
  for item in items:
 
244
  return None, None, None
245
 
246
  def fetch_article_text(link: str) -> str:
247
+ """
248
+ Fetch the article text from the given link (first 5 paragraphs).
249
+ """
250
  print("[LOG] Fetching article text from:", link)
251
  if not link:
252
  print("[LOG] No link provided for article text.")
 
274
  guest_name: str = "John",
275
  sponsor_style: str = "Separate Break"
276
  ):
277
+ """
278
+ Sends the system_prompt plus input_text to the Groq LLM to generate a
279
+ multi-speaker Dialogue in JSON, returning a Dialogue object.
280
+
281
+ sponsor_style can be "Separate Break" or "Blended".
282
+ We add instructions telling the model how to integrate the sponsor content.
283
+ """
284
  print("[LOG] Generating script with tone:", tone, "and length:", target_length)
285
  groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
286
 
 
435
  def generate_audio_mp3(text: str, speaker: str) -> str:
436
  """
437
  Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
438
+ We also do some pre-processing for punctuation, abbreviations, numeric expansions,
439
+ plus emotive expressions (ha, sigh, etc.).
440
  """
441
  try:
442
  print(f"[LOG] Generating audio for speaker: {speaker}")
 
493
  def _preprocess_text_for_tts(text: str, speaker: str) -> str:
494
  """
495
  1) "SaaS" => "sass"
496
+ 2) Insert periods in uppercase abbreviations -> remove for TTS
497
+ 3) Convert decimals like "3.14" -> "three point one four"
498
+ 4) Convert pure integer numbers like "20" -> "twenty"
499
  5) Expand leftover all-caps
500
+ 6) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
501
+ 7) If speaker != Jane, insert filler words
502
+ 8) Remove random fillers
503
+ 9) Capitalize sentence starts
504
  """
505
  # 1) "SaaS" => "sass"
506
  text = re.sub(r"\b(?i)SaaS\b", "sass", text)
507
 
508
+ # 2) Insert periods for uppercase abbreviations (>=2 chars), then remove them
509
  def insert_periods_for_abbrev(m):
510
  abbr = m.group(0)
511
  parted = ".".join(list(abbr)) + "."
512
  return parted
513
  text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
514
+ text = re.sub(r"\.\.", ".", text)
 
 
515
  def remove_periods_for_tts(m):
516
  chunk = m.group(0)
517
  return chunk.replace(".", " ").strip()
 
520
  # 3) Hyphens -> spaces
521
  text = re.sub(r"-", " ", text)
522
 
523
+ # 4) Convert decimals (e.g. "3.14")
524
  def convert_decimal(m):
525
  number_str = m.group()
526
  parts = number_str.split('.')
 
529
  return f"{whole_part} point {decimal_part}"
530
  text = re.sub(r"\b\d+\.\d+\b", convert_decimal, text)
531
 
532
+ # 5) Convert pure integer => words
 
533
  def convert_int_to_words(m):
534
  num_str = m.group()
 
 
 
535
  return number_to_words(int(num_str))
 
536
  text = re.sub(r"\b\d+\b", convert_int_to_words, text)
537
 
538
+ # 6) Expand leftover all-caps => "NASA" => "N A S A"
539
  def expand_abbreviations(m):
540
  abbrev = m.group()
541
  if abbrev.endswith('s') and abbrev[:-1].isupper():
 
542
  singular = abbrev[:-1]
543
  expanded = " ".join(list(singular)) + "s"
544
  special_plurals = {
 
551
  return " ".join(list(abbrev))
552
  text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
553
 
554
+ # 7) Emotive placeholders
555
+ # "haha", "ha", "heh", "lol" => "(* laughs *)"
556
+ text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
557
+ # "sigh" => "(* sighs *)"
558
+ text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
559
+ # "groan", "moan" => "(* groans *)"
560
+ text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
561
+
562
+ # 8) Insert filler words if speaker != Jane
563
  if speaker != "Jane":
564
  def insert_thinking_pause(m):
565
  word = m.group(1)
 
571
  keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
572
  text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
573
 
 
574
  conj_pattern = r"\b(and|but|so|because|however)\b"
575
  text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
576
 
577
+ # 9) Remove random fillers
578
  text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
579
 
580
+ # 10) Capitalize sentence starts
581
  def capitalize_match(m):
582
  return m.group().upper()
583
  text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
 
586
 
587
  def number_to_words(n: int) -> str:
588
  """
589
+ Basic integer-to-words up to ~99999.
590
+ For a robust approach, consider the 'num2words' library.
591
  """
592
  if n == 0:
593
  return "zero"
 
595
  if n < 0:
596
  return "minus " + number_to_words(abs(n))
597
 
 
598
  ones = ["","one","two","three","four","five","six","seven","eight","nine"]
599
+ teens = ["ten","eleven","twelve","thirteen","fourteen","fifteen",
600
+ "sixteen","seventeen","eighteen","nineteen"]
601
+ tens_words = ["","","twenty","thirty","forty","fifty",
602
+ "sixty","seventy","eighty","ninety"]
603
 
604
  words = []
605
+
606
  def two_digit_word(x):
607
  if x == 0:
608
  return ""
 
610
  return ones[x]
611
  if 10 <= x < 20:
612
  return teens[x-10]
613
+ t = x // 10
614
+ o = x % 10
615
+ return tens_words[t] + (f" {ones[o]}" if o else "")
 
616
 
 
617
  thousands = n // 1000
618
  remainder = n % 1000
619
  if thousands > 0:
620
  words.append(two_digit_word(thousands))
621
  words.append("thousand")
622
 
 
623
  hundreds = remainder // 100
624
  last_two = remainder % 100
625
  if hundreds > 0:
 
634
  return " ".join(w for w in words if w).strip()
635
 
636
  def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
637
+ """
638
+ Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
639
+ 1) Start with 2 seconds of music alone before speech begins.
640
+ 2) Loop music if shorter than final audio length.
641
+ 3) Lower music volume so speech is clear.
642
+ """
643
  if custom_music_path:
644
  music_path = custom_music_path
645
  else: