Update utils.py
utils.py CHANGED
@@ -1,5 +1,3 @@
-# utils.py
-
 import os
 import re
 import json
@@ -15,11 +13,10 @@ import tiktoken
 from groq import Groq
 import numpy as np
 import torch
-import random
 
 class DialogueItem(BaseModel):
-    speaker: Literal["Jane", "John"]
-    display_speaker: str = "Jane"
+    speaker: Literal["Jane", "John"]  # TTS voice
+    display_speaker: str = "Jane"  # For display in transcript
     text: str
 
 class Dialogue(BaseModel):
@@ -47,8 +44,7 @@ def truncate_text(text, max_tokens=2048):
 
 def extract_text_from_url(url):
     """
-    Fetches and extracts readable text from a given URL
-    (stripping out scripts, styles, etc.).
+    Fetches and extracts readable text from a given URL (stripping out scripts, styles, etc.).
     """
     print("[LOG] Extracting text from URL:", url)
     try:
@@ -85,8 +81,7 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
 
 def is_sufficient(text: str, min_word_count: int = 500) -> bool:
     """
-    Checks if the fetched text meets our sufficiency criteria
-    (e.g., at least 500 words).
+    Checks if the fetched text meets our sufficiency criteria (e.g., at least 500 words).
    """
     word_count = len(text.split())
     print(f"[DEBUG] Aggregated word count: {word_count}")
@@ -98,6 +93,7 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
     Appends it to our aggregated info if found.
     """
     print("[LOG] Querying LLM for additional information.")
+
     system_prompt = (
         "You are an AI assistant with extensive knowledge up to 2023-10. "
         "Provide additional relevant information on the following topic based on your knowledge base.\n\n"
@@ -105,7 +101,9 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
         f"Existing Information: {existing_text}\n\n"
         "Please add more insightful details, facts, and perspectives to enhance the understanding of the topic."
     )
+
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+
     try:
         response = groq_client.chat.completions.create(
             messages=[{"role": "system", "content": system_prompt}],
@@ -113,19 +111,22 @@ def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
             max_tokens=1024,
             temperature=0.7
         )
+
+        additional_info = response.choices[0].message.content.strip()
+        print("[DEBUG] Additional information from LLM:")
+        print(additional_info)
+        return additional_info
+
     except Exception as e:
         print("[ERROR] Groq API error during fallback:", e)
         return ""
-    additional_info = response.choices[0].message.content.strip()
-    print("[DEBUG] Additional information from LLM:")
-    print(additional_info)
-    return additional_info
 
 def research_topic(topic: str) -> str:
     """
     Gathers info from various RSS feeds and Wikipedia. If needed, queries the LLM
     for more data if the aggregated text is insufficient.
     """
+
     sources = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -137,484 +138,472 @@ def research_topic(topic: str) -> str:
         "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
     }
 
+    summary_parts = []
+
+    # Wikipedia summary
+    wiki_summary = fetch_wikipedia_summary(topic)
+    if wiki_summary:
+        summary_parts.append(f"From Wikipedia: {wiki_summary}")
+
+    # For each RSS feed
+    for name, feed_url in sources.items():
+        try:
+            items = fetch_rss_feed(feed_url)
+            if not items:
+                continue
+
+            title, desc, link = find_relevant_article(items, topic, min_match=2)
+            if link:
+                article_text = fetch_article_text(link)
+                if article_text:
+                    summary_parts.append(f"From {name}: {article_text}")
+                else:
+                    summary_parts.append(f"From {name}: {title} - {desc}")
+        except Exception as e:
+            print(f"[ERROR] Error fetching from {name} RSS feed:", e)
+            continue
+
+    aggregated_info = " ".join(summary_parts)
+    print("[DEBUG] Aggregated info from primary sources:")
+    print(aggregated_info)
+
+    # If not enough data, fallback to LLM
+    if not is_sufficient(aggregated_info):
+        print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
+        additional_info = query_llm_for_additional_info(topic, aggregated_info)
+        if additional_info:
+            aggregated_info += " " + additional_info
+        else:
+            print("[ERROR] Failed to retrieve additional info from LLM.")
+
+    if not aggregated_info:
+        return f"Sorry, I couldn't find recent information on '{topic}'."
+
+    return aggregated_info
 
 def fetch_wikipedia_summary(topic: str) -> str:
+    """
+    Fetch a quick Wikipedia summary of the topic via the official Wikipedia API.
+    """
+    print("[LOG] Fetching Wikipedia summary for:", topic)
+    try:
+        search_url = (
+            f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
+            "&limit=1&namespace=0&format=json"
+        )
+        resp = requests.get(search_url)
+        if resp.status_code != 200:
+            print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
+            return ""
+
+        data = resp.json()
+        if len(data) > 1 and data[1]:
+            title = data[1][0]
+            summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
+            s_resp = requests.get(summary_url)
+            if s_resp.status_code == 200:
+                s_data = s_resp.json()
+                if "extract" in s_data:
+                    print("[LOG] Wikipedia summary fetched successfully.")
+                    return s_data["extract"]
+        return ""
+    except Exception as e:
+        print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
+        return ""
 
 def fetch_rss_feed(feed_url: str) -> list:
+    """
+    Pulls RSS feed data from a given URL and returns items.
+    """
+    print("[LOG] Fetching RSS feed:", feed_url)
+    try:
+        resp = requests.get(feed_url)
+        if resp.status_code != 200:
+            print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
+            return []
+
+        soup = BeautifulSoup(resp.content, "xml")
+        items = soup.find_all("item")
+        return items
+    except Exception as e:
+        print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
+        return []
 
 def find_relevant_article(items, topic: str, min_match=2) -> tuple:
+    """
+    Check each article in the RSS feed for mention of the topic by counting
+    the number of keyword matches.
+    """
+    print("[LOG] Finding relevant articles...")
+    keywords = re.findall(r'\w+', topic.lower())
+
+    for item in items:
+        title = item.find("title").get_text().strip() if item.find("title") else ""
+        description = item.find("description").get_text().strip() if item.find("description") else ""
+        text = (title + " " + description).lower()
+        matches = sum(1 for kw in keywords if kw in text)
+        if matches >= min_match:
+            link = item.find("link").get_text().strip() if item.find("link") else ""
+            print(f"[LOG] Relevant article found: {title}")
+            return title, description, link
+
+    return None, None, None
 
 def fetch_article_text(link: str) -> str:
+    """
+    Fetch the article text from the given link (first 5 paragraphs).
+    """
+    print("[LOG] Fetching article text from:", link)
+    if not link:
+        print("[LOG] No link provided for article text.")
+        return ""
+
+    try:
+        resp = requests.get(link)
+        if resp.status_code != 200:
+            print(f"[ERROR] Failed to fetch article from {link}")
+            return ""
+
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        paragraphs = soup.find_all("p")
+        text = " ".join(p.get_text() for p in paragraphs[:5])  # first 5 paragraphs
+        print("[LOG] Article text fetched successfully.")
+        return text.strip()
+    except Exception as e:
+        print(f"[ERROR] Error fetching article text: {e}")
+        return ""
 
 def generate_script(
+    system_prompt: str,
+    input_text: str,
+    tone: str,
+    target_length: str,
+    host_name: str = "Jane",
+    guest_name: str = "John",
+    sponsor_style: str = "Separate Break",
+    sponsor_provided=None  # Accept sponsor_provided parameter
 ):
+    print("[LOG] Generating script with tone:", tone, "and length:", target_length)
+
+    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+
+    words_per_minute = 150
+    numeric_minutes = 3
+    match = re.search(r"(\d+)", target_length)
+    if match:
+        numeric_minutes = int(match.group(1))
+
+    min_words = max(50, numeric_minutes * 100)
+    max_words = numeric_minutes * words_per_minute
+
+    # Tone mapping dictionary
+    tone_map = {
+        "Humorous": "funny and exciting, makes people chuckle",
+        "Formal": "business-like, well-structured, professional",
+        "Casual": "like a conversation between close friends, relaxed and informal",
+        "Youthful": "like how teenagers might chat, energetic and lively"
+    }
+    chosen_tone = tone_map.get(tone, "casual")
+
+    # Determine sponsor instructions based on sponsor_provided and sponsor_style
+    if sponsor_provided:
+        if sponsor_style == "Separate Break":
+            sponsor_instructions = (
+                "If sponsor content is provided, include it in a separate ad break (~30 seconds). "
+                "Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
+            )
+        else:
+            sponsor_instructions = (
+                "If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
+                "Avoid abrupt transitions."
+            )
+    else:
+        sponsor_instructions = ""  # No sponsor instructions if sponsor_provided is empty
+
+    prompt = (
+        f"{system_prompt}\n"
+        f"TONE: {chosen_tone}\n"
+        f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
+        f"INPUT TEXT: {input_text}\n\n"
+        f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
+        "Please provide the output in the following JSON format without any additional text:\n\n"
+        "{\n"
+        '  "dialogue": [\n'
+        '    {\n'
+        '      "speaker": "Jane",\n'
+        '      "text": "..."\n'
+        '    },\n'
+        '    {\n'
+        '      "speaker": "John",\n'
+        '      "text": "..."\n'
+        '    }\n'
+        "  ]\n"
+        "}"
+    )
+
+    print("[LOG] Sending prompt to Groq:")
+    print(prompt)
+
+    try:
+        response = groq_client.chat.completions.create(
+            messages=[{"role": "system", "content": prompt}],
+            model="llama-3.3-70b-versatile",
+            max_tokens=2048,
+            temperature=0.7
+        )
+    except Exception as e:
+        print("[ERROR] Groq API error:", e)
+        raise ValueError(f"Error communicating with Groq API: {str(e)}")
+
+    raw_content = response.choices[0].message.content.strip()
+    start_index = raw_content.find('{')
+    end_index = raw_content.rfind('}')
+    if start_index == -1 or end_index == -1:
+        raise ValueError("Failed to parse dialogue: No JSON found.")
+
+    json_str = raw_content[start_index:end_index + 1].strip()
+
+    try:
+        data = json.loads(json_str)
+        dialogue_list = data.get("dialogue", [])
+
+        for d in dialogue_list:
+            raw_speaker = d.get("speaker", "Jane")
+            if raw_speaker.lower() == host_name.lower():
+                d["speaker"] = "Jane"
+                d["display_speaker"] = host_name
+            elif raw_speaker.lower() == guest_name.lower():
+                d["speaker"] = "John"
+                d["display_speaker"] = guest_name
+            else:
+                d["speaker"] = "Jane"
+                d["display_speaker"] = raw_speaker
+
+        new_dialogue_items = []
+        for d in dialogue_list:
+            if "display_speaker" not in d:
+                d["display_speaker"] = d["speaker"]
+            new_dialogue_items.append(DialogueItem(**d))
+
+        return Dialogue(dialogue=new_dialogue_items)
+    except json.JSONDecodeError as e:
+        print("[ERROR] JSON decoding (format) failed:", e)
+        raise ValueError(f"Failed to parse dialogue: {str(e)}")
+    except Exception as e:
+        print("[ERROR] JSON decoding failed:", e)
+        raise ValueError(f"Failed to parse dialogue: {str(e)}")
+
+def transcribe_youtube_video(video_url: str) -> str:
+    print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
+    video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
+    if not video_id_match:
+        raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
+
+    video_id = video_id_match.group(1)
+    print("[LOG] Extracted video ID:", video_id)
+
+    base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
+    params = {
+        "video_id": video_id,
+        "lang": "en"
+    }
+    headers = {
+        "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
+        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
+    }
+
+    try:
+        response = requests.get(base_url, headers=headers, params=params, timeout=30)
+        print("[LOG] RapidAPI Response Status Code:", response.status_code)
+        print("[LOG] RapidAPI Response Body:", response.text)
+
+        if response.status_code != 200:
+            raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
+
+        data = response.json()
+        if not isinstance(data, list) or not data:
+            raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
+
+        transcript_as_text = data[0].get('transcriptionAsText', '').strip()
+        if not transcript_as_text:
+            raise ValueError("transcriptionAsText field is missing or empty.")
+
+        print("[LOG] Transcript retrieval successful.")
+        print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
+        snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
+        print(f"[DEBUG] Transcript Snippet: {snippet}")
+
+        return transcript_as_text
+    except Exception as e:
+        print("[ERROR] RapidAPI transcription error:", e)
+        raise ValueError(f"Error transcribing YouTube video via RapidAPI: {str(e)}")
+
+def generate_audio_mp3(text: str, speaker: str) -> str:
+    """
+    Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
+    We also do some pre-processing for punctuation, abbreviations, numeric expansions,
+    plus emotive expressions (ha, sigh, etc.).
+    """
+    try:
+        print(f"[LOG] Generating audio for speaker: {speaker}")
+        processed_text = _preprocess_text_for_tts(text, speaker)
+
+        deepgram_api_url = "https://api.deepgram.com/v1/speak"
+        params = {
+            "model": "aura-asteria-en",  # female by default
+        }
+        if speaker == "John":
+            params["model"] = "aura-zeus-en"
+
+        headers = {
+            "Accept": "audio/mpeg",
+            "Content-Type": "application/json",
+            "Authorization": f"Token {os.environ.get('DEEPGRAM_API_KEY')}"
+        }
+        body = {
+            "text": processed_text
+        }
+
+        response = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)
+        if response.status_code != 200:
+            raise ValueError(f"Deepgram TTS error: {response.status_code}, {response.text}")
+
+        content_type = response.headers
-        if 'audio/mpeg' not in content_type:
-            raise ValueError("Unexpected Content-Type from Deepgram.")
-
-        for chunk in response.iter_content(chunk_size=8192):
-            if chunk:
-                mp3_file.write(chunk)
-        mp3_path = mp3_file.name
-
-        audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
-        audio_seg = effects.normalize(audio_seg)
-
-        audio_seg.export(final_mp3_path, format="mp3")
-
-        os.remove(mp3_path)
-
-    except Exception as e:
-        print("[ERROR] Error generating audio:", e)
-        raise ValueError(f"Error generating audio: {str(e)}")
-
-def _preprocess_text_for_tts(text: str, speaker: str) -> str:
-    """
-    1) "SaaS" => "sass"
-    2) Insert periods for uppercase abbreviations -> remove for TTS
-    3) Preserve numbers for natural TTS pronunciation
-    4) Expand leftover all-caps
-    5) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
-    6) If speaker != Jane, insert filler words
-    7) Remove random fillers
-    8) Capitalize sentence starts
-    """
-    # 1) "SaaS" => "sass"
-    text = re.sub(r"\b(?i)SaaS\b", "sass", text)
-
-    # 2) Insert periods in uppercase abbreviations (>=2 chars), then remove them
-    def insert_periods_for_abbrev(m):
-        abbr = m.group(0)
-        parted = ".".join(list(abbr)) + "."
-        return parted
-    text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
-    text = re.sub(r"\.\.", ".", text)
-    def remove_periods_for_tts(m):
-        chunk = m.group(0)
-        return chunk.replace(".", " ").strip()
-    text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
-
-    # 3) Preserve numbers by removing any digit-specific processing
-    # Let TTS handle natural number pronunciation
-
-    # 4) Hyphens -> spaces (but preserve hyphenated numbers)
-    text = re.sub(r"(?<!\d)-(?!\d)", " ", text)
-
-    # 5) Emotive placeholders
-    text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
-    text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
-    text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
-
-    # 6) Insert filler words if speaker != "Jane"
-    if speaker != "Jane":
-        def insert_thinking_pause(m):
-            word = m.group(1)
-            if random.random() < 0.3:
-                filler = random.choice(['hmm,', 'well,', 'let me see,'])
-                return f"{word}..., {filler}"
-            else:
-                return f"{word}...,"
-        keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
-        text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
-
-        conj_pattern = r"\b(and|but|so|because|however)\b"
-        text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
-
-    # 7) Remove random fillers
-    text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
-
-    # 8) Capitalize sentence starts
-    def capitalize_match(m):
-        return m.group().upper()
-    text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
-
-    return text.strip()
-
-def _spell_digits(d: str) -> str:
-    """
-    Convert individual digits '3' -> 'three'.
-    """
-    digit_map = {
-        '0': 'zero',
-        '1': 'one',
-        '2': 'two',
-        '3': 'three',
-        '4': 'four',
-        '5': 'five',
-        '6': 'six',
-        '7': 'seven',
-        '8': 'eight',
-        '9': 'nine'
-    }
-    return " ".join(digit_map[ch] for ch in d if ch in digit_map)
-
-def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
-    """
-    Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
-    1) Start with 2 seconds of music alone before speech begins.
-    2) Loop the music if it's shorter than the final audio length.
-    3) Lower music volume so the speech is clear.
-    """
-    if custom_music_path:
-        music_path = custom_music_path
-    else:
-        music_path = "bg_music.mp3"
-
-    except Exception as e:
-        print("[ERROR] Failed to load background music:", e)
-        return spoken
-
-    looped_music = AudioSegment.empty()
-    while len(looped_music) < total_length_ms:
-        looped_music += bg_music
-
-    final_mix = looped_music.overlay(spoken, position=2000)
-    return final_mix
-
-def call_groq_api_for_qa(system_prompt: str) -> str:
-    """
-    A minimal placeholder for your short Q&A LLM call.
-    Must return a JSON string, e.g.:
-    {"speaker": "John", "text": "Short answer here"}
-    """
-    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-    try:
-        response = groq_client.chat.completions.create(
-            messages=[{"role": "system", "content": system_prompt}],
-            model="llama-3.3-70b-versatile",
-            max_tokens=512,
-            temperature=0.7
-        )
-    except Exception as e:
-        print("[ERROR] Groq API error:", e)
-        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
-        return json.dumps(fallback)
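
For orientation, here is a minimal usage sketch (not part of the commit) of how the updated research and scripting helpers might be wired together; the topic, prompt text, and argument values are illustrative assumptions, and GROQ_API_KEY must be set in the environment.

# Illustrative sketch: research a topic, then turn it into a two-host script.
from utils import research_topic, generate_script

topic = "community solar power"                      # assumed example topic
aggregated_info = research_topic(topic)              # RSS + Wikipedia, with LLM fallback if thin

script = generate_script(
    system_prompt="You are writing a two-host podcast conversation.",  # assumed prompt text
    input_text=aggregated_info,
    tone="Casual",                                   # one of the tone_map keys
    target_length="3 minutes",                       # minutes are parsed with re.search(r"(\d+)", ...)
    host_name="Jane",
    guest_name="John",
    sponsor_style="Separate Break",
    sponsor_provided=None,                           # no sponsor read in this sketch
)

for item in script.dialogue:
    # display_speaker keeps the on-screen name; speaker ("Jane"/"John") selects the TTS voice
    print(f"{item.display_speaker}: {item.text}")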
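
A follow-on sketch of the audio path, continuing from the script above: it assumes DEEPGRAM_API_KEY is set, that pydub and ffmpeg are installed, and that mix_with_bg_music (shown in the removed block) remains available alongside a bg_music.mp3 file; none of this is asserted by the commit itself.

# Illustrative sketch: synthesize each line, stitch the segments, then add background music.
from pydub import AudioSegment
from utils import generate_audio_mp3, mix_with_bg_music

spoken = AudioSegment.empty()
for item in script.dialogue:
    # "Jane" maps to the aura-asteria-en voice, "John" to aura-zeus-en inside generate_audio_mp3
    segment_path = generate_audio_mp3(item.text, item.speaker)
    spoken += AudioSegment.from_file(segment_path, format="mp3")

# mix_with_bg_music leads with ~2 seconds of music and loops it under the speech
final_mix = mix_with_bg_music(spoken)                # or mix_with_bg_music(spoken, "custom_music.mp3")
final_mix.export("podcast_episode.mp3", format="mp3")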
|