siddhartharyaai committed on
Commit 66b4c56 · verified · 1 Parent(s): fcd65f7

Update utils.py

Files changed (1)
  1. utils.py +606 -358
utils.py CHANGED
@@ -1,50 +1,43 @@
1
  import os
2
- import re
3
  import json
 
4
  import requests
5
  import tempfile
6
- from bs4 import BeautifulSoup
7
- from typing import List, Literal
8
  from pydantic import BaseModel
 
9
  from pydub import AudioSegment, effects
10
- from transformers import pipeline
11
- import yt_dlp
12
  import tiktoken
13
- import numpy as np
14
- import torch
15
- import random
16
 
17
- import base64
18
- from io import BytesIO
19
- import pdfkit
20
- import markdown # Added for Markdown to HTML conversion
21
-
22
- # Define Dialogue Models
23
  class DialogueItem(BaseModel):
24
- speaker: Literal["Jane", "John"]
25
- display_speaker: str = "Jane"
26
  text: str
27
 
28
  class Dialogue(BaseModel):
29
  dialogue: List[DialogueItem]
30
 
31
- # Initialize ASR Pipeline (if used elsewhere)
32
- asr_pipeline = pipeline(
33
- "automatic-speech-recognition",
34
- model="openai/whisper-tiny.en",
35
- device=0 if torch.cuda.is_available() else -1
36
- )
37
-
38
- def truncate_text(text, max_tokens=2048):
39
- print("[LOG] Truncating text if needed.")
40
- tokenizer = tiktoken.get_encoding("cl100k_base")
41
- tokens = tokenizer.encode(text)
42
- if len(tokens) > max_tokens:
43
- print("[LOG] Text too long, truncating.")
44
- return tokenizer.decode(tokens[:max_tokens])
45
- return text
46
-
47
- def extract_text_from_url(url):
 
 
 
48
  print("[LOG] Extracting text from URL:", url)
49
  try:
50
  headers = {
@@ -68,51 +61,79 @@ def extract_text_from_url(url):
68
  print(f"[ERROR] Exception during text extraction from URL: {e}")
69
  return ""
70
 
71
- def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
72
- print(f"[LOG] Shifting pitch by {semitones} semitones.")
73
- new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
74
- shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
75
- return shifted_audio.set_frame_rate(audio.frame_rate)
76
-
77
- def is_sufficient(text: str, min_word_count: int = 500) -> bool:
78
- word_count = len(text.split())
79
- print(f"[DEBUG] Aggregated word count: {word_count}")
80
- return word_count >= min_word_count
81
-
82
- ###############################################################################
83
- # Rewrites text in professional style
84
- ###############################################################################
85
- def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
86
- if not raw_text.strip():
87
  return ""
88
 
89
- system_prompt = (
90
- "You are a professional writing assistant. Your goal is to rewrite "
91
- "the provided text so that it is:\n"
92
- "1) Written in clear, fluent, professional English\n"
93
- f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
94
- "3) Organized in paragraphs or bullet points\n"
95
- "4) Maintained or slightly enhanced in detail without significant summarization\n"
96
- "5) No references to the rewriting process or disclaimers\n"
97
- )
 
 
 
 
98
 
99
- user_prompt = f"Please rewrite this text:\n\n{raw_text}"
100
101
  try:
102
- response = call_deepseek_api(
103
- system_prompt=system_prompt,
104
- user_prompt=user_prompt,
105
- max_tokens=1024,
106
- temperature=0.7
107
- )
108
- return response.strip()
 
 
109
  except Exception as e:
110
- print("[ERROR] rewriting text via Deepseek LLM failed:", e)
111
- return raw_text
112
 
113
- ###############################################################################
114
- # Event Registry (News API) aggregator
115
- ###############################################################################
116
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
117
  """
118
  Query https://eventregistry.org/api/v1/article/getArticles
@@ -161,9 +182,6 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
161
  print("[ERROR] Event Registry approach failed:", e)
162
  return []
163
 
164
- ###############################################################################
165
- # Bing via SerpApi
166
- ###############################################################################
167
  def fetch_bing_results(query: str, count: int = 10) -> list:
168
  serp_api_key = os.environ.get("SERP_API_KEY")
169
  if not serp_api_key:
@@ -193,18 +211,103 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
193
  print("[ERROR] Bing SerpApi approach failed:", e)
194
  return []
195
 
196
  ###############################################################################
197
  # Unified aggregator: google + bing + wiki + rss + event registry + fallback
198
  ###############################################################################
 
199
  def perform_deep_research(topic: str) -> str:
200
  """
201
- 1) Google (up to 10) if creds
202
- 2) Bing (up to 10) if SERP_API_KEY
203
- 3) Wikipedia summary
204
- 4) RSS approach
205
- 5) Event Registry (news api) if NEWS_API_KEY
206
- 6) If still nothing, use LLM fallback
 
 
 
207
  """
 
 
 
208
  # Step 1: Google
209
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
210
  google_api_key = os.environ.get("GOOGLE_API_KEY")
@@ -217,12 +320,12 @@ def perform_deep_research(topic: str) -> str:
217
  "q": topic,
218
  "cx": google_cse_id,
219
  "key": google_api_key,
220
- "num": 10
221
  }
222
  resp = requests.get(url, params=params, timeout=15)
223
  resp.raise_for_status()
224
  data = resp.json()
225
- items = data.get("items", [])
226
  for it in items:
227
  google_sources.append({
228
  "title": it.get("title", ""),
@@ -233,7 +336,7 @@ def perform_deep_research(topic: str) -> str:
233
  print("[ERROR] Google approach failed:", e)
234
 
235
  # Step 2: Bing
236
- bing_results = fetch_bing_results(topic, count=10)
237
 
238
  # Step 3: Wikipedia summary
239
  wiki_summary_text = fetch_wikipedia_summary(topic)
@@ -245,7 +348,8 @@ def perform_deep_research(topic: str) -> str:
245
  "snippet": wiki_summary_text
246
  }
247
 
248
- # Step 4: RSS approach
 
249
  sources_dict = {
250
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
251
  "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -255,8 +359,7 @@ def perform_deep_research(topic: str) -> str:
255
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
256
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
257
  }
258
- rss_sources = []
259
- for name, feed_url in sources_dict.items():
260
  try:
261
  items = fetch_rss_feed(feed_url)
262
  if not items:
@@ -281,9 +384,9 @@ def perform_deep_research(topic: str) -> str:
281
  continue
282
 
283
  # Step 5: Event Registry
284
- event_registry_res = fetch_eventregistry_articles(topic, count=10)
285
 
286
- # Combine everything
287
  combined = []
288
  combined.extend(google_sources)
289
  combined.extend(bing_results)
@@ -305,42 +408,26 @@ def perform_deep_research(topic: str) -> str:
305
  }]
306
  return _draft_professional_report(topic, fallback_data)
307
  else:
308
- # Rewrite each
309
- final_list = []
310
- idx = 0
311
- for source in combined:
312
- idx += 1
313
- link = source.get("link", "")
314
- snippet = source.get("snippet", "")
315
- title = source.get("title", "")
316
-
317
- cleaned_text = rewrite_in_professional_style(topic, snippet)
318
- if cleaned_text.strip():
319
- final_list.append({
320
- "index": idx,
321
- "title": title,
322
- "link": link,
323
- "cleaned_text": cleaned_text
324
- })
325
 
326
- if not final_list:
327
- print("[LOG] Aggregator produced no final content after rewriting. Using LLM fallback.")
328
- # LLM-based fallback
329
- fallback_text = query_llm_for_additional_info(topic, "")
330
- cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
331
- fallback_data = [{
332
- "index": 1,
333
- "title": "Fallback Info",
334
- "link": "N/A",
335
- "cleaned_text": cleaned_fb
336
- }]
337
- return _draft_professional_report(topic, fallback_data)
338
-
339
- return _draft_professional_report(topic, final_list)
340
 
341
  def _draft_professional_report(topic: str, sources_list: list) -> str:
342
  """
343
- Build final professional doc:
344
  - Title
345
  - Executive Summary
346
  - Introduction
@@ -349,6 +436,13 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
349
  - Conclusion
350
  - References footnotes
351
  Ensures at least ~1000 words.
  """
353
  merged_text = []
354
  footnotes = []
@@ -362,32 +456,32 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
362
  merged_text.append(text_block)
363
  all_content = "\n\n".join(merged_text)
364
 
365
- # ENFORCE LONGER REPORT (~1000 words).
366
  system_prompt = f"""You are a highly skilled professional research analyst.
367
- You have access to multiple authoritative sources on the topic: {topic}.
368
- Your task is to produce a comprehensive and detailed formal research report that includes the following sections:
369
 
370
- 1. **Title:** Use the topic as the title of the report.
371
- 2. **Executive Summary:** Provide a concise overview highlighting the key findings and insights.
 
372
  3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
373
  4. **Main Body:**
374
- - **Sub-heading 1:** Summarize insights from Source 1.
375
- - **Sub-heading 2:** Summarize insights from Source 2.
376
- - *(Continue as needed for all sources)*
377
- - **Analysis:** Provide an in-depth analysis combining information from all sources.
378
- 5. **Conclusion:** Present final thoughts, implications, and potential future directions.
379
  6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
380
 
381
  **Requirements:**
382
- - **Length:** The report must be at least **1,000 words** in total.
383
  - **Content Quality:**
384
  - Incorporate relevant facts, figures, and statistics.
385
  - Use professional and clear language.
386
  - Ensure each section is well-developed without unnecessary repetition.
387
- - **Structure:** Maintain a logical and cohesive flow throughout the report.
388
- - **Formatting:** Use proper formatting for headings, sub-headings, and references.
389
 
390
- **Below is the aggregated content from your sources (with footnote references):**
391
  -----------------------------------------------------------------------
392
  {all_content}
393
  -----------------------------------------------------------------------
@@ -404,18 +498,16 @@ Your task is to produce a comprehensive and detailed formal research report that
404
  # Calculate token counts
405
  max_tokens = 6000 # OpenRouter's token limit
406
  system_prompt_tokens = count_tokens(system_prompt)
407
- all_content_tokens = count_tokens(all_content)
408
- total_tokens = system_prompt_tokens + all_content_tokens
409
 
410
- print(f"[DEBUG] Total tokens before optimization: {total_tokens}")
411
 
412
- if total_tokens > max_tokens:
413
  # Calculate allowed tokens for all_content
414
  allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
415
  if allowed_tokens_for_content <= 0:
416
  print("[ERROR] System prompt alone exceeds the token limit.")
417
  return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
418
-
419
  # Truncate all_content to fit
420
  tokenizer = tiktoken.get_encoding("cl100k_base")
421
  all_content_tokens_list = tokenizer.encode(all_content)
@@ -428,7 +520,7 @@ Your task is to produce a comprehensive and detailed formal research report that
428
  response = call_deepseek_api(
429
  system_prompt=system_prompt,
430
  user_prompt="", # No additional user prompt
431
- max_tokens=3000, # Increased to allow more detailed output
432
  temperature=0.7
433
  )
434
  final_report = response.strip()
@@ -441,6 +533,10 @@ Your task is to produce a comprehensive and detailed formal research report that
441
  print("[ERROR] Could not finalize professional report:", e)
442
  return "An unexpected error occurred. Please try again later."
443
444
  def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
445
  """
446
  Function to call DeepSeek R1 via OpenRouter API.
@@ -465,8 +561,17 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, tem
465
  }
466
  response = requests.post("https://openrouter.ai/api/v1/chat/completions",
467
  headers=headers, data=json.dumps(data))
468
- response.raise_for_status()
469
- return response.json()["choices"][0]["message"]["content"]
470
  except requests.exceptions.HTTPError as e:
471
  status_code = e.response.status_code
472
  error_content = e.response.json()
@@ -483,108 +588,14 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, tem
483
  print("[ERROR] Could not communicate with OpenRouter API:", e)
484
  raise ValueError("An unexpected error occurred. Please try again later.")
485
 
486
- def generate_pdf_from_markdown(markdown_text: str) -> bytes:
487
- """
488
- Converts Markdown text to a PDF file.
489
-
490
- Args:
491
- markdown_text (str): The Markdown content to convert.
492
 
493
- Returns:
494
- bytes: The generated PDF file in bytes.
495
- """
496
- try:
497
- # Convert Markdown to HTML
498
- html = markdown.markdown(markdown_text)
499
-
500
- # Generate PDF from HTML
501
- pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
502
-
503
- return pdf_bytes
504
- except Exception as e:
505
- print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
506
- return b""
507
-
508
- def fetch_wikipedia_summary(topic: str) -> str:
509
- print("[LOG] Fetching Wikipedia summary for:", topic)
510
- try:
511
- search_url = (
512
- f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
513
- "&limit=1&namespace=0&format=json"
514
- )
515
- resp = requests.get(search_url)
516
- if resp.status_code != 200:
517
- print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
518
- return ""
519
- data = resp.json()
520
- if len(data) > 1 and data[1]:
521
- title = data[1][0]
522
- summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
523
- s_resp = requests.get(summary_url)
524
- if s_resp.status_code == 200:
525
- s_data = s_resp.json()
526
- if "extract" in s_data:
527
- print("[LOG] Wikipedia summary fetched successfully.")
528
- return s_data["extract"]
529
- return ""
530
- except Exception as e:
531
- print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
532
- return ""
533
-
534
- def fetch_rss_feed(feed_url: str) -> list:
535
- print("[LOG] Fetching RSS feed:", feed_url)
536
- try:
537
- resp = requests.get(feed_url)
538
- if resp.status_code != 200:
539
- print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
540
- return []
541
- soup = BeautifulSoup(resp.content, "xml")
542
- items = soup.find_all("item")
543
- return items
544
- except Exception as e:
545
- print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
546
- return []
547
-
548
- def find_relevant_article(items, topic: str, min_match=2) -> tuple:
549
- print("[LOG] Finding relevant articles...")
550
- keywords = re.findall(r'\w+', topic.lower())
551
- for item in items:
552
- title = item.find("title").get_text().strip() if item.find("title") else ""
553
- description = item.find("description").get_text().strip() if item.find("description") else ""
554
- text = (title + " " + description).lower()
555
- matches = sum(1 for kw in keywords if kw in text)
556
- if matches >= min_match:
557
- link = item.find("link").get_text().strip() if item.find("link") else ""
558
- print(f"[LOG] Relevant article found: {title}")
559
- return title, description, link
560
- return None, None, None
561
-
562
- def fetch_article_text(link: str) -> str:
563
- print("[LOG] Fetching article text from:", link)
564
- if not link:
565
- print("[LOG] No link provided for article text.")
566
- return ""
567
- try:
568
- resp = requests.get(link)
569
- if resp.status_code != 200:
570
- print(f"[ERROR] Failed to fetch article from {link}")
571
- return ""
572
- soup = BeautifulSoup(resp.text, 'html.parser')
573
- paragraphs = soup.find_all("p")
574
- text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
575
- print("[LOG] Article text fetched successfully.")
576
- return text.strip()
577
- except Exception as e:
578
- print(f"[ERROR] Error fetching article text: {e}")
579
- return ""
580
-
581
- ###############################################################################
582
- # Comprehensive Audio Generation Function
583
- ###############################################################################
584
- def generate_audio_mp3(text: str, speaker: str) -> str:
585
- """
586
- This function is correctly generating and returning the actual MP3 file path.
587
- It utilizes Deepgram for English (American) and Murf for other languages.
588
  """
589
  try:
590
  import streamlit as st
@@ -709,54 +720,367 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
709
  print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
710
  return final_mp3_path
711
 
712
- except Exception as e:
713
- print("[ERROR] Error generating audio:", e)
714
- raise ValueError(f"Error generating audio: {str(e)}")
715
-
716
- def transcribe_youtube_video(video_url: str) -> str:
717
- print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
718
- video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
719
- if not video_id_match:
720
- raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
721
-
722
- video_id = video_id_match.group(1)
723
- print("[LOG] Extracted video ID:", video_id)
724
-
725
- base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
726
- params = {"video_id": video_id, "lang": "en"}
727
- headers = {
728
- "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
729
- "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
730
  }
731
 
732
  try:
733
- response = requests.get(base_url, headers=headers, params=params, timeout=30)
734
- print("[LOG] RapidAPI Response Status Code:", response.status_code)
735
- print("[LOG] RapidAPI Response Body:", response.text)
736
 
737
- if response.status_code != 200:
738
- raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
739
 
740
- data = response.json()
741
- if not isinstance(data, list) or not data:
742
- raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
743
 
744
- transcript_as_text = data[0].get('transcriptionAsText', '').strip()
745
- if not transcript_as_text:
746
- raise ValueError("transcriptionAsText field is missing or empty.")
747
 
748
- print("[LOG] Transcript retrieval successful.")
749
- print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
750
- snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
751
- print(f"[DEBUG] Transcript Snippet: {snippet}")
 
 
 
 
752
 
753
- return transcript_as_text
754
  except Exception as e:
755
- print("[ERROR] RapidAPI transcription error:", e)
756
- raise ValueError(f"Error transcribing YouTube video via RapidAPI: {str(e)}")
757
 
758
  ###############################################################################
759
- # generate_script Function and Helper
760
  ###############################################################################
761
 
762
  def generate_script(
@@ -826,7 +1150,7 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
826
  List[DialogueItem]: A list of DialogueItem objects.
827
  """
828
  # Define a regex pattern to identify lines like "HostName: Dialogue"
829
- pattern = r"(?i)\b({host}|{guest})\b:\s*(.*)".format(host=re.escape(host_name), guest=re.escape(guest_name))
830
  matches = re.findall(pattern, script_text)
831
 
832
  dialogue_items = []
@@ -844,53 +1168,6 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
844
  # Additional Helper Functions (if any)
845
  ###############################################################################
846
 
847
- def _preprocess_text_for_tts(text: str, speaker: str) -> str:
848
- # Unchanged logic for adding filler words, etc.
849
- text = re.sub(r"\bNo\.\b", "Number", text)
850
- text = re.sub(r"\b(?i)SaaS\b", "sass", text)
851
- abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
852
-
853
- def insert_periods_for_abbrev(m):
854
- abbr = m.group(0)
855
- if abbr in abbreviations_as_words:
856
- return abbr
857
- return ".".join(list(abbr)) + "."
858
-
859
- text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
860
- text = re.sub(r"\.\.", ".", text)
861
-
862
- def remove_periods_for_tts(m):
863
- return m.group().replace(".", " ").strip()
864
-
865
- text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
866
- text = re.sub(r"-", " ", text)
867
- text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
868
- text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
869
- text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
870
-
871
- if speaker != "Jane":
872
- def insert_thinking_pause(m):
873
- word = m.group(1)
874
- if random.random() < 0.3:
875
- filler = random.choice(['hmm,', 'well,', 'let me see,'])
876
- return f"{word}..., {filler}"
877
- else:
878
- return f"{word}...,"
879
-
880
- keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
881
- text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
882
-
883
- conj_pattern = r"\b(and|but|so|because|however)\b"
884
- text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
885
-
886
- text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
887
-
888
- def capitalize_match(m):
889
- return m.group().upper()
890
-
891
- text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
892
- return text.strip()
893
-
894
  def _spell_digits(d: str) -> str:
895
  digit_map = {
896
  '0': 'zero', '1': 'one', '2': 'two', '3': 'three',
@@ -899,35 +1176,6 @@ def _spell_digits(d: str) -> str:
899
  }
900
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
901
 
902
- def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
903
- # unchanged
904
- if custom_music_path:
905
- music_path = custom_music_path
906
- else:
907
- music_path = "bg_music.mp3"
908
-
909
- if not os.path.exists(music_path):
910
- print(f"[ERROR] Background music file not found: {music_path}")
911
- return spoken # Return spoken audio without background music
912
-
913
- try:
914
- bg_music = AudioSegment.from_file(music_path, format="mp3")
915
- except Exception as e:
916
- print("[ERROR] Failed to load background music:", e)
917
- return spoken
918
-
919
- bg_music = bg_music - 18.0
920
- total_length_ms = len(spoken) + 2000
921
- looped_music = AudioSegment.empty()
922
- while len(looped_music) < total_length_ms:
923
- looped_music += bg_music
924
- looped_music = looped_music[:total_length_ms]
925
- final_mix = looped_music.overlay(spoken, position=2000)
926
- return final_mix
927
-
928
  ###############################################################################
929
- # Unified aggregator: google + bing + wiki + rss + event registry + fallback
930
  ###############################################################################
931
- # The perform_deep_research function is already defined above.
932
-
933
- # No need to redefine perform_deep_research again.
 
1
  import os
 
2
  import json
3
+ import re
4
  import requests
5
  import tempfile
6
+ from typing import List
 
7
  from pydantic import BaseModel
8
+ from bs4 import BeautifulSoup
9
  from pydub import AudioSegment, effects
 
 
10
  import tiktoken
 
 
 
11
 
12
+ # Define Pydantic Models
13
  class DialogueItem(BaseModel):
14
+ speaker: str
15
+ display_speaker: str
16
  text: str
17
 
18
  class Dialogue(BaseModel):
19
  dialogue: List[DialogueItem]
20
 
21
+ ###############################################################################
22
+ # Helper Functions
23
+ ###############################################################################
24
+
25
+ def extract_text_from_pdf(pdf_path: str) -> str:
26
+ print("[LOG] Extracting text from PDF:", pdf_path)
27
+ try:
28
+ reader = pypdf.PdfReader(pdf_path)
29
+ text = ""
30
+ for page_num, page in enumerate(reader.pages):
31
+ page_text = page.extract_text()
32
+ if page_text:
33
+ text += page_text + "\n"
34
+ print("[LOG] Text extraction from PDF successful.")
35
+ return text
36
+ except Exception as e:
37
+ print(f"[ERROR] Failed to extract text from PDF: {e}")
38
+ return ""
39
+
40
+ def extract_text_from_url(url: str) -> str:
41
  print("[LOG] Extracting text from URL:", url)
42
  try:
43
  headers = {
 
61
  print(f"[ERROR] Exception during text extraction from URL: {e}")
62
  return ""
63
 
64
+ def fetch_wikipedia_summary(topic: str) -> str:
65
+ print("[LOG] Fetching Wikipedia summary for:", topic)
66
+ try:
67
+ search_url = (
68
+ f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
69
+ "&limit=1&namespace=0&format=json"
70
+ )
71
+ resp = requests.get(search_url)
72
+ if resp.status_code != 200:
73
+ print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
74
+ return ""
75
+ data = resp.json()
76
+ if len(data) > 1 and data[1]:
77
+ title = data[1][0]
78
+ summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
79
+ s_resp = requests.get(summary_url)
80
+ if s_resp.status_code == 200:
81
+ s_data = s_resp.json()
82
+ if "extract" in s_data:
83
+ print("[LOG] Wikipedia summary fetched successfully.")
84
+ return s_data["extract"]
85
+ return ""
86
+ except Exception as e:
87
+ print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
88
  return ""
89
 
90
+ def fetch_rss_feed(feed_url: str) -> list:
91
+ print("[LOG] Fetching RSS feed:", feed_url)
92
+ try:
93
+ resp = requests.get(feed_url)
94
+ if resp.status_code != 200:
95
+ print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
96
+ return []
97
+ soup = BeautifulSoup(resp.content, "xml")
98
+ items = soup.find_all("item")
99
+ return items
100
+ except Exception as e:
101
+ print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
102
+ return []
103
 
104
+ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
105
+ print("[LOG] Finding relevant articles...")
106
+ keywords = re.findall(r'\w+', topic.lower())
107
+ for item in items:
108
+ title = item.find("title").get_text().strip() if item.find("title") else ""
109
+ description = item.find("description").get_text().strip() if item.find("description") else ""
110
+ text = (title + " " + description).lower()
111
+ matches = sum(1 for kw in keywords if kw in text)
112
+ if matches >= min_match:
113
+ link = item.find("link").get_text().strip() if item.find("link") else ""
114
+ print(f"[LOG] Relevant article found: {title}")
115
+ return title, description, link
116
+ return None, None, None
117
 
118
+ def fetch_article_text(link: str) -> str:
119
+ print("[LOG] Fetching article text from:", link)
120
+ if not link:
121
+ print("[LOG] No link provided for article text.")
122
+ return ""
123
  try:
124
+ resp = requests.get(link)
125
+ if resp.status_code != 200:
126
+ print(f"[ERROR] Failed to fetch article from {link}")
127
+ return ""
128
+ soup = BeautifulSoup(resp.text, 'html.parser')
129
+ paragraphs = soup.find_all("p")
130
+ text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
131
+ print("[LOG] Article text fetched successfully.")
132
+ return text.strip()
133
  except Exception as e:
134
+ print(f"[ERROR] Error fetching article text: {e}")
135
+ return ""
136
 
 
 
 
137
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
138
  """
139
  Query https://eventregistry.org/api/v1/article/getArticles
 
182
  print("[ERROR] Event Registry approach failed:", e)
183
  return []
184
 
 
 
 
185
  def fetch_bing_results(query: str, count: int = 10) -> list:
186
  serp_api_key = os.environ.get("SERP_API_KEY")
187
  if not serp_api_key:
 
211
  print("[ERROR] Bing SerpApi approach failed:", e)
212
  return []
213
 
214
+ ###############################################################################
215
+ # Summarization Function
216
+ ###############################################################################
217
+
218
+ def summarize_text(text: str, max_length: int = 200) -> str:
219
+ """
220
+ Summarizes the given text to the specified maximum word length.
221
+
222
+ Args:
223
+ text (str): The text to summarize.
224
+ max_length (int): The maximum number of words in the summary.
225
+
226
+ Returns:
227
+ str: The summarized text.
228
+ """
229
+ system_prompt = (
230
+ f"You are a professional summarizer. Please condense the following text "
231
+ f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
232
+ )
233
+ user_prompt = text
234
+
235
+ try:
236
+ summary = call_deepseek_api(
237
+ system_prompt=system_prompt,
238
+ user_prompt=user_prompt,
239
+ max_tokens=500, # Adjust as needed
240
+ temperature=0.5
241
+ )
242
+ return summary.strip()
243
+ except Exception as e:
244
+ print(f"[ERROR] Summarization failed: {e}")
245
+ # Fallback: return the original text truncated to max_length words
246
+ return " ".join(text.split()[:max_length]) + "..."
247
+
248
+ ###############################################################################
249
+ # Rewrites text in professional style
250
+ ###############################################################################
251
+
252
+ def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
253
+ if not raw_text.strip():
254
+ return ""
255
+
256
+ system_prompt = (
257
+ "You are a professional writing assistant. Your goal is to rewrite "
258
+ "the provided text so that it is:\n"
259
+ "1) Written in clear, fluent, professional English\n"
260
+ f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
261
+ "3) Organized in paragraphs or bullet points\n"
262
+ "4) Maintained or slightly enhanced in detail without significant summarization\n"
263
+ "5) No references to the rewriting process or disclaimers\n"
264
+ )
265
+
266
+ user_prompt = f"Please rewrite this text:\n\n{raw_text}"
267
+
268
+ try:
269
+ rewritten = call_deepseek_api(
270
+ system_prompt=system_prompt,
271
+ user_prompt=user_prompt,
272
+ max_tokens=1024,
273
+ temperature=0.7
274
+ )
275
+ # Optionally, summarize the rewritten text to further reduce token count
276
+ summary = summarize_text(rewritten, max_length=150)
277
+ return summary
278
+ except Exception as e:
279
+ print("[ERROR] rewriting text via Deepseek LLM failed:", e)
280
+ return raw_text
281
+
282
+ ###############################################################################
283
+ # Event Registry (News API) aggregator
284
+ ###############################################################################
285
+ # Already handled in fetch_eventregistry_articles
286
+
287
+ ###############################################################################
288
+ # Bing via SerpApi
289
+ ###############################################################################
290
+ # Already handled in fetch_bing_results
291
+
292
  ###############################################################################
293
  # Unified aggregator: google + bing + wiki + rss + event registry + fallback
294
  ###############################################################################
295
+
296
  def perform_deep_research(topic: str) -> str:
297
  """
298
+ Perform deep research by aggregating data from multiple sources.
299
+ Limits the number of sources to prevent exceeding token limits.
300
+ Summarizes each source's content to reduce token count.
301
+
302
+ Args:
303
+ topic (str): The research topic.
304
+
305
+ Returns:
306
+ str: The final professional report in Markdown format.
307
  """
308
+ # Define the maximum number of sources per aggregator
309
+ MAX_SOURCES_PER_AGGREGATOR = 5
310
+
311
  # Step 1: Google
312
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
313
  google_api_key = os.environ.get("GOOGLE_API_KEY")
 
320
  "q": topic,
321
  "cx": google_cse_id,
322
  "key": google_api_key,
323
+ "num": 10 # Fetch more to account for filtering
324
  }
325
  resp = requests.get(url, params=params, timeout=15)
326
  resp.raise_for_status()
327
  data = resp.json()
328
+ items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
329
  for it in items:
330
  google_sources.append({
331
  "title": it.get("title", ""),
 
336
  print("[ERROR] Google approach failed:", e)
337
 
338
  # Step 2: Bing
339
+ bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
340
 
341
  # Step 3: Wikipedia summary
342
  wiki_summary_text = fetch_wikipedia_summary(topic)
 
348
  "snippet": wiki_summary_text
349
  }
350
 
351
+ # Step 4: RSS approach (NewsAPI assumed here)
352
+ rss_sources = []
353
  sources_dict = {
354
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
355
  "CNN": "http://rss.cnn.com/rss/edition.rss",
 
359
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
360
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
361
  }
362
+ for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
 
363
  try:
364
  items = fetch_rss_feed(feed_url)
365
  if not items:
 
384
  continue
385
 
386
  # Step 5: Event Registry
387
+ event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
388
 
389
+ # Combine all sources
390
  combined = []
391
  combined.extend(google_sources)
392
  combined.extend(bing_results)
 
408
  }]
409
  return _draft_professional_report(topic, fallback_data)
410
  else:
411
+ # Summarize each source's snippet to reduce token count
412
+ summarized_list = []
413
+ for idx, source in enumerate(combined, start=1):
414
+ summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
415
+ summarized_list.append({
416
+ "index": idx,
417
+ "title": source["title"],
418
+ "link": source["link"],
419
+ "cleaned_text": summary
420
+ })
421
+
422
+ return _draft_professional_report(topic, summarized_list)
423
 
424
+ ###############################################################################
425
+ # Professional Report Drafting Function
426
+ ###############################################################################
427
 
428
  def _draft_professional_report(topic: str, sources_list: list) -> str:
429
  """
430
+ Build a concise professional report:
431
  - Title
432
  - Executive Summary
433
  - Introduction
 
436
  - Conclusion
437
  - References footnotes
438
  Ensures at least ~1000 words.
439
+
440
+ Args:
441
+ topic (str): The research topic.
442
+ sources_list (list): List of summarized sources.
443
+
444
+ Returns:
445
+ str: The final professional report in Markdown format.
446
  """
447
  merged_text = []
448
  footnotes = []
 
456
  merged_text.append(text_block)
457
  all_content = "\n\n".join(merged_text)
458
 
459
+ # Build the system prompt
460
  system_prompt = f"""You are a highly skilled professional research analyst.
461
+ You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
 
462
 
463
+ **Report Structure:**
464
+ 1. **Title:** {topic}
465
+ 2. **Executive Summary:** A concise overview of key findings and insights.
466
  3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
467
  4. **Main Body:**
468
+ - **Section 1:** Insights from Source 1.
469
+ - **Section 2:** Insights from Source 2.
470
+ - *(Continue as needed)*
471
+ - **Analysis:** An in-depth analysis combining information from all sources.
472
+ 5. **Conclusion:** Final thoughts, implications, and potential future directions.
473
  6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
474
 
475
  **Requirements:**
476
+ - **Length:** At least 1,000 words.
477
  - **Content Quality:**
478
  - Incorporate relevant facts, figures, and statistics.
479
  - Use professional and clear language.
480
  - Ensure each section is well-developed without unnecessary repetition.
481
+ - **Structure:** Logical and cohesive flow throughout the report.
482
+ - **Formatting:** Proper formatting for headings, sub-headings, and references.
483
 
484
+ **Aggregated Content from Sources:**
485
  -----------------------------------------------------------------------
486
  {all_content}
487
  -----------------------------------------------------------------------
 
498
  # Calculate token counts
499
  max_tokens = 6000 # OpenRouter's token limit
500
  system_prompt_tokens = count_tokens(system_prompt)
 
 
501
 
502
+ print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
503
 
504
+ if system_prompt_tokens > max_tokens:
505
  # Calculate allowed tokens for all_content
506
  allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
507
  if allowed_tokens_for_content <= 0:
508
  print("[ERROR] System prompt alone exceeds the token limit.")
509
  return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
510
+
511
  # Truncate all_content to fit
512
  tokenizer = tiktoken.get_encoding("cl100k_base")
513
  all_content_tokens_list = tokenizer.encode(all_content)
 
520
  response = call_deepseek_api(
521
  system_prompt=system_prompt,
522
  user_prompt="", # No additional user prompt
523
+ max_tokens=3000, # Adjusted to allow more detailed output
524
  temperature=0.7
525
  )
526
  final_report = response.strip()
 
533
  print("[ERROR] Could not finalize professional report:", e)
534
  return "An unexpected error occurred. Please try again later."
535
 
536
+ ###############################################################################
537
+ # OpenRouter API Communication Function
538
+ ###############################################################################
539
+
540
  def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
541
  """
542
  Function to call DeepSeek R1 via OpenRouter API.
 
561
  }
562
  response = requests.post("https://openrouter.ai/api/v1/chat/completions",
563
  headers=headers, data=json.dumps(data))
564
+ if response.status_code != 200:
565
+ error_message = response.json().get("error", {}).get("message", "Unknown error")
566
+ print(f"[ERROR] OpenRouter API error: {response.status_code} - {error_message}")
567
+ raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
568
+
569
+ response_json = response.json()
570
+ if "choices" not in response_json or not response_json["choices"]:
571
+ print("[ERROR] 'choices' key missing in OpenRouter API response.")
572
+ raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
573
+
574
+ return response_json["choices"][0]["message"]["content"]
575
  except requests.exceptions.HTTPError as e:
576
  status_code = e.response.status_code
577
  error_content = e.response.json()
 
588
  print("[ERROR] Could not communicate with OpenRouter API:", e)
589
  raise ValueError("An unexpected error occurred. Please try again later.")
590
 
591
+ ###############################################################################
592
+ # Comprehensive Audio Generation Function
593
+ ###############################################################################
 
 
 
594
 
595
+ def generate_audio_mp3(text: str, speaker: str) -> str:
596
+ """
597
+ Generates and returns the actual MP3 file path.
598
+ Utilizes Deepgram for English (American) and Murf for other languages.
599
  """
600
  try:
601
  import streamlit as st
 
720
  print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
721
  return final_mp3_path
722
 
723
+ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
724
+ # Unchanged logic for adding filler words, etc.
725
+ text = re.sub(r"\bNo\.\b", "Number", text)
726
+ text = re.sub(r"\b(?i)SaaS\b", "sass", text)
727
+ abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
728
+
729
+ def insert_periods_for_abbrev(m):
730
+ abbr = m.group(0)
731
+ if abbr in abbreviations_as_words:
732
+ return abbr
733
+ return ".".join(list(abbr)) + "."
734
+
735
+ text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
736
+ text = re.sub(r"\.\.", ".", text)
737
+
738
+ def remove_periods_for_tts(m):
739
+ return m.group().replace(".", " ").strip()
740
+
741
+ text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
742
+ text = re.sub(r"-", " ", text)
743
+ text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
744
+ text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
745
+ text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
746
+
747
+ if speaker != "Jane":
748
+ def insert_thinking_pause(m):
749
+ word = m.group(1)
750
+ if random.random() < 0.3:
751
+ filler = random.choice(['hmm,', 'well,', 'let me see,'])
752
+ return f"{word}..., {filler}"
753
+ else:
754
+ return f"{word}...,"
755
+
756
+ keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
757
+ text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
758
+
759
+ conj_pattern = r"\b(and|but|so|because|however)\b"
760
+ text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
761
+
762
+ text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
763
+
764
+ def capitalize_match(m):
765
+ return m.group().upper()
766
+
767
+ text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
768
+ return text.strip()
769
+
770
+ ###############################################################################
771
+ # Unified aggregator: google + bing + wiki + rss + event registry + fallback
772
+ ###############################################################################
773
+
774
+ def perform_deep_research(topic: str) -> str:
775
+ """
776
+ Perform deep research by aggregating data from multiple sources.
777
+ Limits the number of sources to prevent exceeding token limits.
778
+ Summarizes each source's content to reduce token count.
779
+
780
+ Args:
781
+ topic (str): The research topic.
782
+
783
+ Returns:
784
+ str: The final professional report in Markdown format.
785
+ """
786
+ # Define the maximum number of sources per aggregator
787
+ MAX_SOURCES_PER_AGGREGATOR = 5
788
+
789
+ # Step 1: Google
790
+ google_cse_id = os.environ.get("GOOGLE_CSE_ID")
791
+ google_api_key = os.environ.get("GOOGLE_API_KEY")
792
+ google_sources = []
793
+ if google_cse_id and google_api_key:
794
+ try:
795
+ print("[LOG] Attempting Google CSE for topic:", topic)
796
+ url = "https://customsearch.googleapis.com/customsearch/v1"
797
+ params = {
798
+ "q": topic,
799
+ "cx": google_cse_id,
800
+ "key": google_api_key,
801
+ "num": 10 # Fetch more to account for filtering
802
+ }
803
+ resp = requests.get(url, params=params, timeout=15)
804
+ resp.raise_for_status()
805
+ data = resp.json()
806
+ items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
807
+ for it in items:
808
+ google_sources.append({
809
+ "title": it.get("title", ""),
810
+ "link": it.get("link", ""),
811
+ "snippet": it.get("snippet", "")
812
+ })
813
+ except Exception as e:
814
+ print("[ERROR] Google approach failed:", e)
815
+
816
+ # Step 2: Bing
817
+ bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
818
+
819
+ # Step 3: Wikipedia summary
820
+ wiki_summary_text = fetch_wikipedia_summary(topic)
821
+ wiki_item = None
822
+ if wiki_summary_text:
823
+ wiki_item = {
824
+ "title": "Wikipedia Summary",
825
+ "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
826
+ "snippet": wiki_summary_text
827
+ }
828
+
829
+ # Step 4: RSS approach (NewsAPI assumed here)
830
+ rss_sources = []
831
+ sources_dict = {
832
+ "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
833
+ "CNN": "http://rss.cnn.com/rss/edition.rss",
834
+ "Associated Press": "https://apnews.com/apf-topnews",
835
+ "NDTV": "https://www.ndtv.com/rss/top-stories",
836
+ "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
837
+ "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
838
+ "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
839
  }
840
+ for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
841
+ try:
842
+ items = fetch_rss_feed(feed_url)
843
+ if not items:
844
+ continue
845
+ title, desc, link = find_relevant_article(items, topic, min_match=2)
846
+ if link:
847
+ article_text = fetch_article_text(link)
848
+ if article_text:
849
+ rss_sources.append({
850
+ "title": f"{name} RSS Article",
851
+ "link": link,
852
+ "snippet": article_text
853
+ })
854
+ else:
855
+ rss_sources.append({
856
+ "title": f"{name} RSS Article",
857
+ "link": link,
858
+ "snippet": f"{title} - {desc}"
859
+ })
860
+ except Exception as e:
861
+ print(f"[ERROR] Error fetching from {name} RSS feed:", e)
862
+ continue
863
+
864
+ # Step 5: Event Registry
865
+ event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
866
+
867
+ # Combine all sources
868
+ combined = []
869
+ combined.extend(google_sources)
870
+ combined.extend(bing_results)
871
+ if wiki_item:
872
+ combined.append(wiki_item)
873
+ combined.extend(rss_sources)
874
+ combined.extend(event_registry_res)
875
+
876
+ if not combined:
877
+ print("[LOG] No results found from aggregator. Using LLM fallback.")
878
+ # LLM-based fallback
879
+ fallback_text = query_llm_for_additional_info(topic, "")
880
+ cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
881
+ fallback_data = [{
882
+ "index": 1,
883
+ "title": "Fallback Info",
884
+ "link": "N/A",
885
+ "cleaned_text": cleaned_fb
886
+ }]
887
+ return _draft_professional_report(topic, fallback_data)
888
+ else:
889
+ # Summarize each source's snippet to reduce token count
890
+ summarized_list = []
891
+ for idx, source in enumerate(combined, start=1):
892
+ summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
893
+ summarized_list.append({
894
+ "index": idx,
895
+ "title": source["title"],
896
+ "link": source["link"],
897
+ "cleaned_text": summary
898
+ })
899
+
900
+ return _draft_professional_report(topic, summarized_list)
901
+
902
+ ###############################################################################
903
+ # Professional Report Drafting Function
904
+ ###############################################################################
905
+
906
+ def _draft_professional_report(topic: str, sources_list: list) -> str:
907
+ """
908
+ Build a concise professional report:
909
+ - Title
910
+ - Executive Summary
911
+ - Introduction
912
+ - Main Body with sub-headings
913
+ - Analysis
914
+ - Conclusion
915
+ - References footnotes
916
+ Ensures at least ~1000 words.
917
+
918
+ Args:
919
+ topic (str): The research topic.
920
+ sources_list (list): List of summarized sources.
921
+
922
+ Returns:
923
+ str: The final professional report in Markdown format.
924
+ """
925
+ merged_text = []
926
+ footnotes = []
927
+ for s in sources_list:
928
+ footnotes.append(f"[^{s['index']}]: {s['link']}")
929
+ text_block = (
930
+ f"Source {s['index']} Title: {s['title']}\n"
931
+ f"FootnoteRef: [^{s['index']}]\n"
932
+ f"Text:\n{s['cleaned_text']}\n"
933
+ )
934
+ merged_text.append(text_block)
935
+ all_content = "\n\n".join(merged_text)
936
+
937
+ # Build the system prompt
938
+ system_prompt = f"""You are a highly skilled professional research analyst.
939
+ You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
940
+
941
+ **Report Structure:**
942
+ 1. **Title:** {topic}
943
+ 2. **Executive Summary:** A concise overview of key findings and insights.
944
+ 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
945
+ 4. **Main Body:**
946
+ - **Section 1:** Insights from Source 1.
947
+ - **Section 2:** Insights from Source 2.
948
+ - *(Continue as needed)*
949
+ - **Analysis:** An in-depth analysis combining information from all sources.
950
+ 5. **Conclusion:** Final thoughts, implications, and potential future directions.
951
+ 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
952
+
953
+ **Requirements:**
954
+ - **Length:** At least 1,000 words.
955
+ - **Content Quality:**
956
+ - Incorporate relevant facts, figures, and statistics.
957
+ - Use professional and clear language.
958
+ - Ensure each section is well-developed without unnecessary repetition.
959
+ - **Structure:** Logical and cohesive flow throughout the report.
960
+ - **Formatting:** Proper formatting for headings, sub-headings, and references.
961
+
962
+ **Aggregated Content from Sources:**
963
+ -----------------------------------------------------------------------
964
+ {all_content}
965
+ -----------------------------------------------------------------------
966
+ **Footnotes:**
967
+ {chr(10).join(footnotes)}
968
+ """
969
+
970
+ # Token Counting Function
971
+ def count_tokens(text: str) -> int:
972
+ tokenizer = tiktoken.get_encoding("cl100k_base")
973
+ tokens = tokenizer.encode(text)
974
+ return len(tokens)
975
+
976
+ # Calculate token counts
977
+ max_tokens = 6000 # OpenRouter's token limit
978
+ system_prompt_tokens = count_tokens(system_prompt)
979
+
980
+ print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
981
+
982
+ if system_prompt_tokens > max_tokens:
983
+ # Calculate allowed tokens for all_content
984
+ allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
985
+ if allowed_tokens_for_content <= 0:
986
+ print("[ERROR] System prompt alone exceeds the token limit.")
987
+ return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
988
+
989
+ # Truncate all_content to fit
990
+ tokenizer = tiktoken.get_encoding("cl100k_base")
991
+ all_content_tokens_list = tokenizer.encode(all_content)
992
+ truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
993
+ truncated_content = tokenizer.decode(truncated_tokens)
994
+ system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
995
+ print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
996
 
997
  try:
998
+ response = call_deepseek_api(
999
+ system_prompt=system_prompt,
1000
+ user_prompt="", # No additional user prompt
1001
+ max_tokens=3000, # Adjusted to allow more detailed output
1002
+ temperature=0.7
1003
+ )
1004
+ final_report = response.strip()
1005
+ # Optionally, check word count
1006
+ word_count = len(final_report.split())
1007
+ if word_count < 1000:
1008
+ print(f"[WARNING] Generated report is below desired length: {word_count} words.")
1009
+ return final_report
1010
+ except Exception as e:
1011
+ print("[ERROR] Could not finalize professional report:", e)
1012
+ return "An unexpected error occurred. Please try again later."
1013
 
1014
+ ###############################################################################
1015
+ # PDF Generation Function
1016
+ ###############################################################################
1017
+
1018
+ def generate_pdf_from_markdown(markdown_text: str) -> bytes:
1019
+ """
1020
+ Converts Markdown text to a PDF file.
1021
+
1022
+ Args:
1023
+ markdown_text (str): The Markdown content to convert.
1024
 
1025
+ Returns:
1026
+ bytes: The generated PDF file in bytes.
1027
+ """
1028
+ try:
1029
+ # Convert Markdown to HTML
1030
+ import markdown
1031
+ import pdfkit
1032
+ html = markdown.markdown(markdown_text)
1033
 
1034
+ # Generate PDF from HTML
1035
+ pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
 
1036
 
1037
+ return pdf_bytes
1038
+ except Exception as e:
1039
+ print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
1040
+ return b""
1041
+
1042
+ ###############################################################################
1043
+ # Audio Mixing Function
1044
+ ###############################################################################
1045
 
1046
+ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
1047
+ """
1048
+ Mixes spoken audio with background music.
1049
+
1050
+ Args:
1051
+ spoken (AudioSegment): The spoken audio segment.
1052
+ custom_music_path (str, optional): Path to custom background music. Defaults to None.
1053
+
1054
+ Returns:
1055
+ AudioSegment: The mixed audio segment.
1056
+ """
1057
+ # unchanged
1058
+ if custom_music_path:
1059
+ music_path = custom_music_path
1060
+ else:
1061
+ music_path = "bg_music.mp3"
1062
+
1063
+ if not os.path.exists(music_path):
1064
+ print(f"[ERROR] Background music file not found: {music_path}")
1065
+ return spoken # Return spoken audio without background music
1066
+
1067
+ try:
1068
+ bg_music = AudioSegment.from_file(music_path, format="mp3")
1069
  except Exception as e:
1070
+ print("[ERROR] Failed to load background music:", e)
1071
+ return spoken
1072
+
1073
+ bg_music = bg_music - 18.0
1074
+ total_length_ms = len(spoken) + 2000
1075
+ looped_music = AudioSegment.empty()
1076
+ while len(looped_music) < total_length_ms:
1077
+ looped_music += bg_music
1078
+ looped_music = looped_music[:total_length_ms]
1079
+ final_mix = looped_music.overlay(spoken, position=2000)
1080
+ return final_mix
1081
 
1082
  ###############################################################################
1083
+ # Generate Script Function and Helper
1084
  ###############################################################################
1085
 
1086
  def generate_script(
 
1150
  List[DialogueItem]: A list of DialogueItem objects.
1151
  """
1152
  # Define a regex pattern to identify lines like "HostName: Dialogue"
1153
+ pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
1154
  matches = re.findall(pattern, script_text)
1155
 
1156
  dialogue_items = []
 
1168
  # Additional Helper Functions (if any)
1169
  ###############################################################################
1170
1171
  def _spell_digits(d: str) -> str:
1172
  digit_map = {
1173
  '0': 'zero', '1': 'one', '2': 'two', '3': 'three',
 
1176
  }
1177
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
1178
1179
  ###############################################################################
1180
+ # End of utils.py
1181
  ###############################################################################