siddhartharyaai committed on
Commit 26794f8 · verified · 1 Parent(s): e6f486e

Update utils.py

Files changed (1)
  1. utils.py +330 -785
utils.py CHANGED
@@ -1,59 +1,56 @@
1
  import os
2
- import json
3
  import re
 
4
  import requests
5
  import tempfile
6
- import time
7
- import logging
8
- from typing import List
9
- from pydantic import BaseModel
10
  from bs4 import BeautifulSoup
 
 
11
  from pydub import AudioSegment, effects
 
 
12
  import tiktoken
13
- import pypdf
14
- import markdown
15
- import pdfkit
16
  import random
17
- import warnings
18
- from cryptography.utils import CryptographyDeprecationWarning
19
- from ratelimit import limits, sleep_and_retry
20
- import streamlit as st
21
-
22
- # Suppress Cryptography Deprecation Warnings
23
- warnings.filterwarnings("ignore", category=CryptographyDeprecationWarning)
24
-
25
- # Configure Logging
26
- logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
27
 
28
- # Define Pydantic Models
 
 
29
  class DialogueItem(BaseModel):
30
- speaker: str
31
- display_speaker: str
32
  text: str
33
 
34
  class Dialogue(BaseModel):
35
  dialogue: List[DialogueItem]
36
 
37
- ###############################################################################
38
- # Helper Functions
39
- ###############################################################################
40
-
41
- def extract_text_from_pdf(pdf_path: str) -> str:
42
- print("[LOG] Extracting text from PDF:", pdf_path)
43
- try:
44
- reader = pypdf.PdfReader(pdf_path)
45
- text = ""
46
- for page_num, page in enumerate(reader.pages):
47
- page_text = page.extract_text()
48
- if page_text:
49
- text += page_text + "\n"
50
- print("[LOG] Text extraction from PDF successful.")
51
- return text
52
- except Exception as e:
53
- print(f"[ERROR] Failed to extract text from PDF: {e}")
54
- return ""
55
-
56
- def extract_text_from_url(url: str) -> str:
 
 
57
  print("[LOG] Extracting text from URL:", url)
58
  try:
59
  headers = {
@@ -77,120 +74,77 @@ def extract_text_from_url(url: str) -> str:
77
  print(f"[ERROR] Exception during text extraction from URL: {e}")
78
  return ""
79
 
80
- def fetch_wikipedia_summary(topic: str) -> str:
81
- print("[LOG] Fetching Wikipedia summary for:", topic)
82
- try:
83
- search_url = (
84
- f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
85
- "&limit=1&namespace=0&format=json"
86
- )
87
- resp = requests.get(search_url)
88
- if resp.status_code != 200:
89
- print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
90
- return ""
91
- data = resp.json()
92
- if len(data) > 1 and data[1]:
93
- title = data[1][0]
94
- summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
95
- s_resp = requests.get(summary_url)
96
- if s_resp.status_code == 200:
97
- s_data = s_resp.json()
98
- if "extract" in s_data:
99
- print("[LOG] Wikipedia summary fetched successfully.")
100
- return s_data["extract"]
101
- return ""
102
- except Exception as e:
103
- print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
104
- return ""
105
 
106
- def fetch_rss_feed(feed_url: str) -> list:
107
- print("[LOG] Fetching RSS feed:", feed_url)
108
- try:
109
- resp = requests.get(feed_url)
110
- if resp.status_code != 200:
111
- print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
112
- return []
113
- soup = BeautifulSoup(resp.content, "xml")
114
- items = soup.find_all("item")
115
- return items
116
- except Exception as e:
117
- print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
118
- return []
119
 
120
- def find_relevant_article(items, topic: str, min_match=2) -> tuple:
121
- print("[LOG] Finding relevant articles...")
122
- keywords = re.findall(r'\w+', topic.lower())
123
- for item in items:
124
- title = item.find("title").get_text().strip() if item.find("title") else ""
125
- description = item.find("description").get_text().strip() if item.find("description") else ""
126
- text = (title + " " + description).lower()
127
- matches = sum(1 for kw in keywords if kw in text)
128
- if matches >= min_match:
129
- link = item.find("link").get_text().strip() if item.find("link") else ""
130
- print(f"[LOG] Relevant article found: {title}")
131
- return title, description, link
132
- return None, None, None
133
-
134
- def fetch_article_text(link: str) -> str:
135
- print("[LOG] Fetching article text from:", link)
136
- if not link:
137
- print("[LOG] No link provided for article text.")
138
  return ""
 
 
 
 
 
 
 
 
 
 
139
  try:
140
- resp = requests.get(link)
141
- if resp.status_code != 200:
142
- print(f"[ERROR] Failed to fetch article from {link}")
143
- return ""
144
- soup = BeautifulSoup(resp.text, 'html.parser')
145
- paragraphs = soup.find_all("p")
146
- text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
147
- print("[LOG] Article text fetched successfully.")
148
- return text.strip()
149
  except Exception as e:
150
- print(f"[ERROR] Error fetching article text: {e}")
151
- return ""
152
 
 
 
 
153
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
154
- """
155
- Query https://eventregistry.org/api/v1/article/getArticles
156
- with the env var NEWS_API_KEY, searching for 'topic'.
157
- Return list of {title, link, snippet}.
158
- """
159
  news_api_key = os.environ.get("NEWS_API_KEY")
160
  if not news_api_key:
161
  print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
162
  return []
163
-
164
  print("[LOG] Attempting Event Registry for topic:", topic)
165
  endpoint = "https://eventregistry.org/api/v1/article/getArticles"
166
- # Minimal example request body
167
  body = {
168
  "action": "getArticles",
169
  "keyword": topic,
170
  "articlesPage": 1,
171
- "articlesCount": count, # up to 100, we do count=10 for uniformity
172
  "articlesSortBy": "date",
173
  "articlesSortByAsc": False,
174
  "dataType": ["news", "pr"],
175
- "forceMaxDataTimeWindow": 31, # last month
176
  "resultType": "articles",
177
  "apiKey": news_api_key
178
  }
179
-
180
  try:
181
  resp = requests.post(endpoint, json=body, timeout=20)
182
  resp.raise_for_status()
183
  data = resp.json()
184
- # According to docs, articles can be found at data["articles"]["results"]
185
  art_data = data.get("articles", {})
186
  results_arr = art_data.get("results", [])
187
-
188
  ret = []
189
  for item in results_arr:
190
- # item might have "title", "url", "body" or "titleUri"
191
  title = item.get("title", "")
192
  url = item.get("url", "")
193
- # pick either "body" or "excerpt"
194
  snippet = item.get("body", "") or item.get("excerpt", "")
195
  ret.append({"title": title, "link": url, "snippet": snippet})
196
  return ret
@@ -198,6 +152,9 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
198
  print("[ERROR] Event Registry approach failed:", e)
199
  return []
200
 
 
 
 
201
  def fetch_bing_results(query: str, count: int = 10) -> list:
202
  serp_api_key = os.environ.get("SERP_API_KEY")
203
  if not serp_api_key:
@@ -227,635 +184,11 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
227
  print("[ERROR] Bing SerpApi approach failed:", e)
228
  return []
229
 
230
- ###############################################################################
231
- # Summarization Function
232
- ###############################################################################
233
-
234
- def summarize_text(text: str, max_length: int = 200) -> str:
235
- """
236
- Summarizes the given text to the specified maximum word length.
237
-
238
- Args:
239
- text (str): The text to summarize.
240
- max_length (int): The maximum number of words in the summary.
241
-
242
- Returns:
243
- str: The summarized text.
244
- """
245
- system_prompt = (
246
- f"You are a professional summarizer. Please condense the following text "
247
- f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
248
- )
249
- user_prompt = text
250
-
251
- try:
252
- summary = call_deepseek_api_cached(
253
- system_prompt=system_prompt,
254
- user_prompt=user_prompt,
255
- max_tokens=500, # Adjust as needed
256
- temperature=0.5
257
- )
258
- return summary.strip()
259
- except Exception as e:
260
- print(f"[ERROR] Summarization failed: {e}")
261
- # Fallback: return the original text truncated to max_length words
262
- return " ".join(text.split()[:max_length]) + "..."
263
-
264
- ###############################################################################
265
- # Rewrites text in professional style
266
- ###############################################################################
267
-
268
- def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
269
- if not raw_text.strip():
270
- return ""
271
-
272
- system_prompt = (
273
- "You are a professional writing assistant. Your goal is to rewrite "
274
- "the provided text so that it is:\n"
275
- "1) Written in clear, fluent, professional English\n"
276
- f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
277
- "3) Organized in paragraphs or bullet points\n"
278
- "4) Maintained or slightly enhanced in detail without significant summarization\n"
279
- "5) No references to the rewriting process or disclaimers\n"
280
- )
281
-
282
- user_prompt = f"Please rewrite this text:\n\n{raw_text}"
283
-
284
- try:
285
- rewritten = call_deepseek_api_cached(
286
- system_prompt=system_prompt,
287
- user_prompt=user_prompt,
288
- max_tokens=1024,
289
- temperature=0.7
290
- )
291
- # Optionally, summarize the rewritten text to further reduce token count
292
- summary = summarize_text(rewritten, max_length=150)
293
- return summary
294
- except Exception as e:
295
- print("[ERROR] Rewriting text via Deepseek LLM failed:", e)
296
- return raw_text
297
-
298
- ###############################################################################
299
- # OpenRouter API Communication Function with Exponential Backoff and Rate Limiting
300
- ###############################################################################
301
-
302
- ONE_MINUTE = 60
303
-
304
- @sleep_and_retry
305
- @limits(calls=5, period=ONE_MINUTE) # Adjust based on OpenRouter's rate limits
306
- def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float, max_retries: int = 5) -> str:
307
- """
308
- Function to call DeepSeek R1 via OpenRouter API with exponential backoff for rate limiting.
309
- """
310
- logging.info("Communicating with DeepSeek R1 via OpenRouter API.")
311
- headers = {
312
- "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
313
- "Content-Type": "application/json",
314
- # Optional headers for OpenRouter leaderboard
315
- # "HTTP-Referer": "<YOUR_SITE_URL>",
316
- # "X-Title": "<YOUR_SITE_NAME>",
317
- }
318
- data = {
319
- "model": "deepseek/deepseek-r1:free", # Ensure this model name is correct
320
- "messages": [
321
- {"role": "system", "content": system_prompt},
322
- {"role": "user", "content": user_prompt}
323
- ],
324
- "max_tokens": max_tokens,
325
- "temperature": temperature
326
- }
327
-
328
- attempt = 0
329
- backoff_time = 1 # Start with 1 second
330
-
331
- while attempt < max_retries:
332
- try:
333
- response = requests.post("https://openrouter.ai/api/v1/chat/completions",
334
- headers=headers, data=json.dumps(data))
335
-
336
- logging.debug(f"OpenRouter API Response Status: {response.status_code}")
337
- logging.debug(f"OpenRouter API Response Body: {response.text}")
338
-
339
- if response.status_code == 200:
340
- response_json = response.json()
341
- if "choices" in response_json and response_json["choices"]:
342
- return response_json["choices"][0]["message"]["content"]
343
- else:
344
- logging.error("'choices' key missing in OpenRouter API response.")
345
- raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
346
- elif response.status_code == 429:
347
- # Rate limit exceeded
348
- retry_after = response.headers.get("Retry-After")
349
- if retry_after:
350
- wait_time = int(retry_after)
351
- else:
352
- wait_time = backoff_time
353
- logging.warning(f"Rate limit exceeded. Attempt {attempt + 1} of {max_retries}. Retrying in {wait_time} seconds...")
354
- time.sleep(wait_time)
355
- backoff_time *= 2 # Exponential backoff
356
- attempt += 1
357
- else:
358
- # Handle other HTTP errors
359
- try:
360
- error_message = response.json().get("error", {}).get("message", "Unknown error")
361
- except json.JSONDecodeError:
362
- error_message = "Non-JSON response received."
363
- logging.error(f"OpenRouter API error: {response.status_code} - {error_message}")
364
- raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
365
-
366
- except requests.exceptions.RequestException as e:
367
- logging.error(f"Request exception: {e}. Attempt {attempt + 1} of {max_retries}. Retrying in {backoff_time} seconds...")
368
- time.sleep(backoff_time)
369
- backoff_time *= 2
370
- attempt += 1
371
-
372
- # After max retries
373
- logging.error("Max retries exceeded. Failed to get a valid response from OpenRouter API.")
374
- raise ValueError("Rate limit exceeded. Please try again later.")
375
-
376
- @st.cache_data(show_spinner=False)
377
- def call_deepseek_api_cached(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
378
- return call_deepseek_api(system_prompt, user_prompt, max_tokens, temperature)
379
-
380
- ###############################################################################
381
- # Professional Report Drafting Function
382
- ###############################################################################
383
-
384
- def _draft_professional_report(topic: str, sources_list: list) -> str:
385
- """
386
- Build a concise professional report:
387
- - Title
388
- - Executive Summary
389
- - Introduction
390
- - Main Body with sub-headings
391
- - Analysis
392
- - Conclusion
393
- - References footnotes
394
- Ensures at least ~1000 words.
395
-
396
- Args:
397
- topic (str): The research topic.
398
- sources_list (list): List of summarized sources.
399
-
400
- Returns:
401
- str: The final professional report in Markdown format.
402
- """
403
- merged_text = []
404
- footnotes = []
405
- for s in sources_list:
406
- footnotes.append(f"[^{s['index']}]: {s['link']}")
407
- text_block = (
408
- f"Source {s['index']} Title: {s['title']}\n"
409
- f"FootnoteRef: [^{s['index']}]\n"
410
- f"Text:\n{s['cleaned_text']}\n"
411
- )
412
- merged_text.append(text_block)
413
- all_content = "\n\n".join(merged_text)
414
-
415
- # Build the system prompt
416
- system_prompt = f"""You are a highly skilled professional research analyst.
417
- You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
418
-
419
- **Report Structure:**
420
- 1. **Title:** {topic}
421
- 2. **Executive Summary:** A concise overview of key findings and insights.
422
- 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
423
- 4. **Main Body:**
424
- - **Section 1:** Insights from Source 1.
425
- - **Section 2:** Insights from Source 2.
426
- - *(Continue as needed)*
427
- - **Analysis:** An in-depth analysis combining information from all sources.
428
- 5. **Conclusion:** Final thoughts, implications, and potential future directions.
429
- 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
430
-
431
- **Requirements:**
432
- - **Length:** At least 1,000 words.
433
- - **Content Quality:**
434
- - Incorporate relevant facts, figures, and statistics.
435
- - Use professional and clear language.
436
- - Ensure each section is well-developed without unnecessary repetition.
437
- - **Structure:** Logical and cohesive flow throughout the report.
438
- - **Formatting:** Proper formatting for headings, sub-headings, and references.
439
-
440
- **Aggregated Content from Sources:**
441
- -----------------------------------------------------------------------
442
- {all_content}
443
- -----------------------------------------------------------------------
444
- **Footnotes:**
445
- {chr(10).join(footnotes)}
446
- """
447
-
448
- # Token Counting Function
449
- def count_tokens(text: str) -> int:
450
- tokenizer = tiktoken.get_encoding("cl100k_base")
451
- tokens = tokenizer.encode(text)
452
- return len(tokens)
453
-
454
- # Calculate token counts
455
- max_tokens = 6000 # OpenRouter's token limit
456
- system_prompt_tokens = count_tokens(system_prompt)
457
-
458
- logging.debug(f"Total tokens before optimization: {system_prompt_tokens}")
459
-
460
- if system_prompt_tokens > max_tokens:
461
- # Calculate allowed tokens for all_content
462
- allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
463
- if allowed_tokens_for_content <= 0:
464
- logging.error("System prompt alone exceeds the token limit.")
465
- return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
466
-
467
- # Truncate all_content to fit
468
- tokenizer = tiktoken.get_encoding("cl100k_base")
469
- all_content_tokens_list = tokenizer.encode(all_content)
470
- truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
471
- truncated_content = tokenizer.decode(truncated_tokens)
472
- system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
473
- logging.debug(f"Truncated content to fit token limits: {len(truncated_tokens)} tokens")
474
-
475
- try:
476
- response = call_deepseek_api_cached(
477
- system_prompt=system_prompt,
478
- user_prompt="", # No additional user prompt
479
- max_tokens=3000, # Adjusted to allow more detailed output
480
- temperature=0.7
481
- )
482
- final_report = response.strip()
483
- # Optionally, check word count
484
- word_count = len(final_report.split())
485
- if word_count < 1000:
486
- logging.warning(f"Generated report is below desired length: {word_count} words.")
487
- return final_report
488
- except Exception as e:
489
- logging.error(f"Could not finalize professional report: {e}")
490
- return "An unexpected error occurred. Please try again later."
491
-
492
- ###############################################################################
493
- # PDF Generation Function
494
- ###############################################################################
495
-
496
- def generate_pdf_from_markdown(markdown_text: str) -> bytes:
497
- """
498
- Converts Markdown text to a PDF file.
499
-
500
- Args:
501
- markdown_text (str): The Markdown content to convert.
502
-
503
- Returns:
504
- bytes: The generated PDF file in bytes.
505
- """
506
- try:
507
- # Convert Markdown to HTML
508
- html = markdown.markdown(markdown_text)
509
-
510
- # Generate PDF from HTML
511
- pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
512
-
513
- return pdf_bytes
514
- except Exception as e:
515
- print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
516
- return b""
517
-
518
- ###############################################################################
519
- # Audio Mixing Function
520
- ###############################################################################
521
-
522
- def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
523
- """
524
- Mixes spoken audio with background music.
525
-
526
- Args:
527
- spoken (AudioSegment): The spoken audio segment.
528
- custom_music_path (str, optional): Path to custom background music. Defaults to None.
529
-
530
- Returns:
531
- AudioSegment: The mixed audio segment.
532
- """
533
- if custom_music_path:
534
- music_path = custom_music_path
535
- else:
536
- music_path = "bg_music.mp3"
537
-
538
- if not os.path.exists(music_path):
539
- print(f"[ERROR] Background music file not found: {music_path}")
540
- return spoken # Return spoken audio without background music
541
-
542
- try:
543
- bg_music = AudioSegment.from_file(music_path, format="mp3")
544
- except Exception as e:
545
- print("[ERROR] Failed to load background music:", e)
546
- return spoken
547
-
548
- bg_music = bg_music - 18.0
549
- total_length_ms = len(spoken) + 2000
550
- looped_music = AudioSegment.empty()
551
- while len(looped_music) < total_length_ms:
552
- looped_music += bg_music
553
- looped_music = looped_music[:total_length_ms]
554
- final_mix = looped_music.overlay(spoken, position=2000)
555
- return final_mix
556
-
557
- ###############################################################################
558
- # Generate Script Function and Helper
559
- ###############################################################################
560
-
561
- def generate_script(
562
- system_prompt: str,
563
- input_text: str,
564
- tone: str,
565
- target_length: str,
566
- host_name: str = "Jane",
567
- guest_name: str = "John",
568
- sponsor_style: str = "Separate Break",
569
- sponsor_provided: bool = False
570
- ) -> Dialogue:
571
- """
572
- Generates a podcast script using DeepSeek R1 via OpenRouter API.
573
-
574
- Args:
575
- system_prompt (str): System-level instructions for the LLM.
576
- input_text (str): The main content or topic for the podcast.
577
- tone (str): Desired tone of the podcast (e.g., Casual, Formal).
578
- target_length (str): Desired length of the podcast (e.g., "3 Mins").
579
- host_name (str, optional): Name of the host. Defaults to "Jane".
580
- guest_name (str, optional): Name of the guest. Defaults to "John".
581
- sponsor_style (str, optional): Style of sponsor integration. Defaults to "Separate Break".
582
- sponsor_provided (bool, optional): Whether sponsor content is provided. Defaults to False.
583
-
584
- Returns:
585
- Dialogue: A Dialogue object containing dialogue items.
586
- """
587
- # Build the user prompt with additional instructions
588
- user_prompt = (
589
- f"Topic: {input_text}\n"
590
- f"Tone: {tone}\n"
591
- f"Length: {target_length}\n"
592
- f"Host: {host_name or 'Jane'}\n"
593
- f"Guest: {guest_name or 'John'}\n"
594
- )
595
- if sponsor_provided:
596
- user_prompt += f"Sponsor Style: {sponsor_style}\n"
597
-
598
- # Call the DeepSeek API to generate the script
599
- try:
600
- response = call_deepseek_api_cached(
601
- system_prompt=system_prompt,
602
- user_prompt=user_prompt,
603
- max_tokens=1500,
604
- temperature=0.7
605
- )
606
- except Exception as e:
607
- print(f"[ERROR] Failed to generate script: {e}")
608
- raise
609
-
610
- # Parse the response into DialogueItems
611
- dialogue_items = parse_script_to_dialogue(response, host_name, guest_name)
612
-
613
- return Dialogue(dialogue=dialogue_items)
614
-
615
- def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str) -> List[DialogueItem]:
616
- """
617
- Parses the script text into a list of DialogueItem objects.
618
-
619
- Args:
620
- script_text (str): The raw script text generated by the LLM.
621
- host_name (str): Name of the host.
622
- guest_name (str): Name of the guest.
623
-
624
- Returns:
625
- List[DialogueItem]: A list of DialogueItem objects.
626
- """
627
- # Define a regex pattern to identify lines like "HostName: Dialogue"
628
- pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
629
- matches = re.findall(pattern, script_text)
630
-
631
- dialogue_items = []
632
- for speaker, text in matches:
633
- speaker_normalized = "Jane" if speaker.lower() == host_name.lower() else "John"
634
- item = DialogueItem(
635
- speaker=speaker_normalized,
636
- display_speaker=speaker,
637
- text=text.strip()
638
- )
639
- dialogue_items.append(item)
640
- return dialogue_items
641
-
642
- ###############################################################################
643
- # Generate Audio MP3 Function
644
- ###############################################################################
645
-
646
- def generate_audio_mp3(text: str, speaker: str) -> str:
647
- """
648
- Generates and returns the actual MP3 file path.
649
- Utilizes Deepgram for English (American) and Murf for other languages.
650
-
651
- Args:
652
- text (str): The text to convert to speech.
653
- speaker (str): The speaker identifier (e.g., "John", "Jane").
654
- try:
659
- import streamlit as st
660
- print(f"[LOG] Generating audio for speaker: {speaker}")
661
- language_selection = st.session_state.get("language_selection", "English (American)")
662
-
663
- if language_selection == "English (American)":
664
- print("[LOG] Using Deepgram TTS for English (American)")
665
- # Process text if speaker is not Jane
666
- if speaker in ["John", "Jane"]:
667
- processed_text = text
668
- else:
669
- processed_text = _preprocess_text_for_tts(text, speaker)
670
-
671
- deepgram_api_url = "https://api.deepgram.com/v1/speak"
672
- params = {"model": "aura-asteria-en"}
673
- if speaker == "John":
674
- params["model"] = "aura-zeus-en"
675
-
676
- headers = {
677
- "Accept": "audio/mpeg",
678
- "Content-Type": "application/json",
679
- "Authorization": f"Token {os.environ.get('DEEPSEEK_API_KEY')}"
680
- }
681
- body = {"text": processed_text}
682
- response = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)
683
- if response.status_code != 200:
684
- raise ValueError(f"Deepgram TTS error: {response.status_code}, {response.text}")
685
-
686
- content_type = response.headers.get('Content-Type', '')
687
- if 'audio/mpeg' not in content_type:
688
- raise ValueError("Unexpected Content-Type from Deepgram.")
689
-
690
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
691
- for chunk in response.iter_content(chunk_size=8192):
692
- if chunk:
693
- mp3_file.write(chunk)
694
- mp3_path = mp3_file.name
695
-
696
- if not os.path.exists(mp3_path):
697
- raise FileNotFoundError(f"Deepgram did not create the MP3 file: {mp3_path}")
698
-
699
- audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
700
- audio_seg = effects.normalize(audio_seg)
701
- final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
702
- audio_seg.export(final_mp3_path, format="mp3")
703
- if os.path.exists(mp3_path):
704
- os.remove(mp3_path)
705
-
706
- print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
707
- if not os.path.exists(final_mp3_path):
708
- raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
709
-
710
- return final_mp3_path
711
-
712
- else:
713
- print(f"[LOG] Using Murf API for language: {language_selection}")
714
- # Process text if language is Hinglish or Hindi
715
- if language_selection == "Hinglish":
716
- from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
717
- text = transliterate(text, DEVANAGARI, IAST)
718
-
719
- api_key = os.environ.get("MURF_API_KEY")
720
- headers = {
721
- "Content-Type": "application/json",
722
- "Accept": "application/json",
723
- "api-key": api_key
724
- }
725
- multi_native_locale = "hi-IN" if language_selection in ["Hinglish", "Hindi"] else "en-IN"
726
- if language_selection == "English (Indian)":
727
- voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
728
- elif language_selection in ["Hindi", "Hinglish"]:
729
- voice_id = "hi-IN-kabir" if speaker == "John" else "hi-IN-shweta"
730
- else:
731
- voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
732
-
733
- payload = {
734
- "audioDuration": 0,
735
- "channelType": "MONO",
736
- "encodeAsBase64": False,
737
- "format": "WAV",
738
- "modelVersion": "GEN2",
739
- "multiNativeLocale": multi_native_locale,
740
- "pitch": 0,
741
- "pronunciationDictionary": {},
742
- "rate": 0,
743
- "sampleRate": 48000,
744
- "style": "Conversational",
745
- "text": text,
746
- "variation": 1,
747
- "voiceId": voice_id
748
- }
749
- response = requests.post("https://api.murf.ai/v1/speech/generate", headers=headers, json=payload)
750
- if response.status_code != 200:
751
- raise ValueError(f"Murf API error: {response.status_code}, {response.text}")
752
-
753
- json_resp = response.json()
754
- audio_url = json_resp.get("audioFile")
755
- if not audio_url:
756
- raise ValueError("No audio file URL returned by Murf API")
757
-
758
- audio_response = requests.get(audio_url)
759
- if audio_response.status_code != 200:
760
- raise ValueError(f"Error fetching audio from {audio_url}")
761
-
762
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
763
- wav_file.write(audio_response.content)
764
- wav_path = wav_file.name
765
-
766
- if not os.path.exists(wav_path):
767
- raise FileNotFoundError(f"Murf did not create the WAV file: {wav_path}")
768
-
769
- audio_seg = AudioSegment.from_file(wav_path, format="wav")
770
- audio_seg = effects.normalize(audio_seg)
771
- final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
772
- audio_seg.export(final_mp3_path, format="mp3")
773
- os.remove(wav_path)
774
-
775
- if not os.path.exists(final_mp3_path):
776
- raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
777
-
778
- print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
779
- return final_mp3_path
780
-
781
- def _preprocess_text_for_tts(text: str, speaker: str) -> str:
782
- """
783
- Preprocesses text for Text-to-Speech conversion by adding pauses, fillers,
784
- and handling specific cases to make the speech sound more natural.
785
-
786
- Args:
787
- text (str): The original text to preprocess.
788
- speaker (str): The speaker identifier (e.g., "John", "Jane").
789
-
790
- Returns:
791
- str: The preprocessed text.
792
- """
793
- # Unchanged logic for adding filler words, etc.
794
- text = re.sub(r"\bNo\.\b", "Number", text)
795
- text = re.sub(r"\b(?i)SaaS\b", "sass", text)
796
- abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
797
-
798
- def insert_periods_for_abbrev(m):
799
- abbr = m.group(0)
800
- if abbr in abbreviations_as_words:
801
- return abbr
802
- return ".".join(list(abbr)) + "."
803
-
804
- text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
805
- text = re.sub(r"\.\.", ".", text)
806
-
807
- def remove_periods_for_tts(m):
808
- return m.group().replace(".", " ").strip()
809
-
810
- text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
811
- text = re.sub(r"-", " ", text)
812
- text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
813
- text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
814
- text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
815
-
816
- if speaker != "Jane":
817
- def insert_thinking_pause(m):
818
- word = m.group(1)
819
- if random.random() < 0.3:
820
- filler = random.choice(['hmm,', 'well,', 'let me see,'])
821
- return f"{word}..., {filler}"
822
- else:
823
- return f"{word}...,"
824
-
825
- keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
826
- text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
827
-
828
- conj_pattern = r"\b(and|but|so|because|however)\b"
829
- text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
830
-
831
- text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
832
-
833
- def capitalize_match(m):
834
- return m.group().upper()
835
-
836
- text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
837
- return text.strip()
838
-
839
- ###############################################################################
840
- # Unified aggregator: google + bing + wiki + rss + event registry + fallback
841
- ###############################################################################
842
-
843
  def perform_deep_research(topic: str) -> str:
844
- """
845
- Perform deep research by aggregating data from multiple sources.
846
- Limits the number of sources to prevent exceeding token limits.
847
- Summarizes each source's content to reduce token count.
848
-
849
- Args:
850
- topic (str): The research topic.
851
-
852
- Returns:
853
- str: The final professional report in Markdown format.
854
- """
855
- # Define the maximum number of sources per aggregator
856
- MAX_SOURCES_PER_AGGREGATOR = 5
857
-
858
- # Step 1: Google
859
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
860
  google_api_key = os.environ.get("GOOGLE_API_KEY")
861
  google_sources = []
@@ -867,12 +200,12 @@ def perform_deep_research(topic: str) -> str:
867
  "q": topic,
868
  "cx": google_cse_id,
869
  "key": google_api_key,
870
- "num": 10 # Fetch more to account for filtering
871
  }
872
  resp = requests.get(url, params=params, timeout=15)
873
  resp.raise_for_status()
874
  data = resp.json()
875
- items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
876
  for it in items:
877
  google_sources.append({
878
  "title": it.get("title", ""),
@@ -881,11 +214,7 @@ def perform_deep_research(topic: str) -> str:
881
  })
882
  except Exception as e:
883
  print("[ERROR] Google approach failed:", e)
884
-
885
- # Step 2: Bing
886
- bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
887
-
888
- # Step 3: Wikipedia summary
889
  wiki_summary_text = fetch_wikipedia_summary(topic)
890
  wiki_item = None
891
  if wiki_summary_text:
@@ -894,9 +223,6 @@ def perform_deep_research(topic: str) -> str:
894
  "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
895
  "snippet": wiki_summary_text
896
  }
897
-
898
- # Step 4: RSS approach (NewsAPI assumed here)
899
- rss_sources = []
900
  sources_dict = {
901
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
902
  "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -906,7 +232,8 @@ def perform_deep_research(topic: str) -> str:
906
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
907
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
908
  }
909
- for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
 
910
  try:
911
  items = fetch_rss_feed(feed_url)
912
  if not items:
@@ -929,11 +256,7 @@ def perform_deep_research(topic: str) -> str:
929
  except Exception as e:
930
  print(f"[ERROR] Error fetching from {name} RSS feed:", e)
931
  continue
932
-
933
- # Step 5: Event Registry
934
- event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
935
-
936
- # Combine all sources
937
  combined = []
938
  combined.extend(google_sources)
939
  combined.extend(bing_results)
@@ -941,10 +264,8 @@ def perform_deep_research(topic: str) -> str:
941
  combined.append(wiki_item)
942
  combined.extend(rss_sources)
943
  combined.extend(event_registry_res)
944
-
945
  if not combined:
946
  print("[LOG] No results found from aggregator. Using LLM fallback.")
947
- # LLM-based fallback
948
  fallback_text = query_llm_for_additional_info(topic, "")
949
  cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
950
  fallback_data = [{
@@ -955,22 +276,231 @@ def perform_deep_research(topic: str) -> str:
955
  }]
956
  return _draft_professional_report(topic, fallback_data)
957
  else:
958
- # Summarize each source's snippet to reduce token count
959
- summarized_list = []
960
- for idx, source in enumerate(combined, start=1):
961
- summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
962
- summarized_list.append({
963
- "index": idx,
964
- "title": source["title"],
965
- "link": source["link"],
966
- "cleaned_text": summary
967
- })
 
 
 
968
 
969
- return _draft_professional_report(topic, summarized_list)
 
 
970
 
971
- ###############################################################################
972
- # Additional Helper Functions (if any)
973
- ###############################################################################
 
 
974
 
975
  def _spell_digits(d: str) -> str:
976
  digit_map = {
@@ -980,6 +510,21 @@ def _spell_digits(d: str) -> str:
980
  }
981
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
982
 
983
- ###############################################################################
984
- # End of utils.py
985
- ###############################################################################
 
 
1
  import os
 
2
  import re
3
+ import json
4
  import requests
5
  import tempfile
 
 
 
 
6
  from bs4 import BeautifulSoup
7
+ from typing import List, Literal
8
+ from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
+ from transformers import pipeline
11
+ import yt_dlp
12
  import tiktoken
13
+ import numpy as np
14
+ import torch
 
15
  import random
16
+ import base64
17
+ from io import BytesIO
18
+ import pdfkit
19
+ import markdown # For Markdown to HTML conversion
 
 
 
 
 
 
20
 
21
+ # ------------------------------
22
+ # Data models
23
+ # ------------------------------
24
  class DialogueItem(BaseModel):
25
+ speaker: Literal["Jane", "John"]
26
+ display_speaker: str = "Jane"
27
  text: str
28
 
29
  class Dialogue(BaseModel):
30
  dialogue: List[DialogueItem]
31
 
32
+ # ------------------------------
33
+ # ASR Pipeline setup
34
+ # ------------------------------
35
+ asr_pipeline = pipeline(
36
+ "automatic-speech-recognition",
37
+ model="openai/whisper-tiny.en",
38
+ device=0 if torch.cuda.is_available() else -1
39
+ )
40
+
41
+ # ------------------------------
42
+ # Helper functions
43
+ # ------------------------------
44
+ def truncate_text(text, max_tokens=2048):
45
+ print("[LOG] Truncating text if needed.")
46
+ tokenizer = tiktoken.get_encoding("cl100k_base")
47
+ tokens = tokenizer.encode(text)
48
+ if len(tokens) > max_tokens:
49
+ print("[LOG] Text too long, truncating.")
50
+ return tokenizer.decode(tokens[:max_tokens])
51
+ return text
52
+
53
+ def extract_text_from_url(url):
54
  print("[LOG] Extracting text from URL:", url)
55
  try:
56
  headers = {
 
74
  print(f"[ERROR] Exception during text extraction from URL: {e}")
75
  return ""
76
 
77
+ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
78
+ print(f"[LOG] Shifting pitch by {semitones} semitones.")
79
+ new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
80
+ shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
81
+ return shifted_audio.set_frame_rate(audio.frame_rate)
 
 
82
 
83
+ def is_sufficient(text: str, min_word_count: int = 500) -> bool:
84
+ word_count = len(text.split())
85
+ print(f"[DEBUG] Aggregated word count: {word_count}")
86
+ return word_count >= min_word_count
 
 
 
 
 
 
 
 
 
87
 
88
+ # ------------------------------
89
+ # Text rewriting using DeepSeek (via OpenRouter)
90
+ # ------------------------------
91
+ def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
92
+ if not raw_text.strip():
 
 
 
93
  return ""
94
+ system_prompt = (
95
+ "You are a professional writing assistant. Your goal is to rewrite "
96
+ "the provided text so that it is:\n"
97
+ "1) Written in clear, fluent, professional English\n"
98
+ f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
99
+ "3) Organized in paragraphs or bullet points\n"
100
+ "4) Maintained or slightly enhanced in detail without significant summarization\n"
101
+ "5) No references to the rewriting process or disclaimers\n"
102
+ )
103
+ user_prompt = f"Please rewrite this text:\n\n{raw_text}"
104
  try:
105
+ response = call_deepseek_api(
106
+ system_prompt=system_prompt,
107
+ user_prompt=user_prompt,
108
+ max_tokens=1024,
109
+ temperature=0.7
110
+ )
111
+ return response.strip()
 
 
112
  except Exception as e:
113
+ print("[ERROR] rewriting text via Deepseek LLM failed:", e)
114
+ return raw_text
115
 
116
+ # ------------------------------
117
+ # Event Registry aggregator
118
+ # ------------------------------
119
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
 
 
 
 
 
120
  news_api_key = os.environ.get("NEWS_API_KEY")
121
  if not news_api_key:
122
  print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
123
  return []
 
124
  print("[LOG] Attempting Event Registry for topic:", topic)
125
  endpoint = "https://eventregistry.org/api/v1/article/getArticles"
 
126
  body = {
127
  "action": "getArticles",
128
  "keyword": topic,
129
  "articlesPage": 1,
130
+ "articlesCount": count,
131
  "articlesSortBy": "date",
132
  "articlesSortByAsc": False,
133
  "dataType": ["news", "pr"],
134
+ "forceMaxDataTimeWindow": 31,
135
  "resultType": "articles",
136
  "apiKey": news_api_key
137
  }
 
138
  try:
139
  resp = requests.post(endpoint, json=body, timeout=20)
140
  resp.raise_for_status()
141
  data = resp.json()
 
142
  art_data = data.get("articles", {})
143
  results_arr = art_data.get("results", [])
 
144
  ret = []
145
  for item in results_arr:
 
146
  title = item.get("title", "")
147
  url = item.get("url", "")
 
148
  snippet = item.get("body", "") or item.get("excerpt", "")
149
  ret.append({"title": title, "link": url, "snippet": snippet})
150
  return ret
 
152
  print("[ERROR] Event Registry approach failed:", e)
153
  return []
154
 
155
+ # ------------------------------
156
+ # Bing results via SerpApi
157
+ # ------------------------------
158
  def fetch_bing_results(query: str, count: int = 10) -> list:
159
  serp_api_key = os.environ.get("SERP_API_KEY")
160
  if not serp_api_key:
 
184
  print("[ERROR] Bing SerpApi approach failed:", e)
185
  return []
186
 
187
+ # ------------------------------
188
+ # Unified deep research aggregator
189
+ # ------------------------------
 
 
 
190
  def perform_deep_research(topic: str) -> str:
191
+ # Limit each source to a maximum of 5 items
 
 
192
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
193
  google_api_key = os.environ.get("GOOGLE_API_KEY")
194
  google_sources = []
 
200
  "q": topic,
201
  "cx": google_cse_id,
202
  "key": google_api_key,
203
+ "num": 5
204
  }
205
  resp = requests.get(url, params=params, timeout=15)
206
  resp.raise_for_status()
207
  data = resp.json()
208
+ items = data.get("items", [])
209
  for it in items:
210
  google_sources.append({
211
  "title": it.get("title", ""),
 
214
  })
215
  except Exception as e:
216
  print("[ERROR] Google approach failed:", e)
217
+ bing_results = fetch_bing_results(topic, count=5)
 
 
 
 
218
  wiki_summary_text = fetch_wikipedia_summary(topic)
219
  wiki_item = None
220
  if wiki_summary_text:
 
223
  "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
224
  "snippet": wiki_summary_text
225
  }
 
 
 
226
  sources_dict = {
227
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
228
  "CNN": "http://rss.cnn.com/rss/edition.rss",
 
232
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
233
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
234
  }
235
+ rss_sources = []
236
+ for name, feed_url in sources_dict.items():
237
  try:
238
  items = fetch_rss_feed(feed_url)
239
  if not items:
 
256
  except Exception as e:
257
  print(f"[ERROR] Error fetching from {name} RSS feed:", e)
258
  continue
259
+ event_registry_res = fetch_eventregistry_articles(topic, count=5)
 
 
 
 
260
  combined = []
261
  combined.extend(google_sources)
262
  combined.extend(bing_results)
 
264
  combined.append(wiki_item)
265
  combined.extend(rss_sources)
266
  combined.extend(event_registry_res)
 
267
  if not combined:
268
  print("[LOG] No results found from aggregator. Using LLM fallback.")
 
269
  fallback_text = query_llm_for_additional_info(topic, "")
270
  cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
271
  fallback_data = [{
 
276
  }]
277
  return _draft_professional_report(topic, fallback_data)
278
  else:
279
+ final_list = []
280
+ idx = 0
281
+ for source in combined:
282
+ idx += 1
283
+ link = source.get("link", "")
284
+ snippet = source.get("snippet", "")
285
+ title = source.get("title", "")
286
+ cleaned_text = rewrite_in_professional_style(topic, snippet)
287
+ if cleaned_text.strip():
288
+ final_list.append({
289
+ "index": idx,
290
+ "title": title,
291
+ "link": link,
292
+ "cleaned_text": cleaned_text
293
+ })
294
+ if not final_list:
295
+ print("[LOG] Aggregator produced no final content after rewriting. Using LLM fallback.")
296
+ fallback_text = query_llm_for_additional_info(topic, "")
297
+ cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
298
+ fallback_data = [{
299
+ "index": 1,
300
+ "title": "Fallback Info",
301
+ "link": "N/A",
302
+ "cleaned_text": cleaned_fb
303
+ }]
304
+ return _draft_professional_report(topic, fallback_data)
305
+ return _draft_professional_report(topic, final_list)
306
+
307
+ def _draft_professional_report(topic: str, sources_list: list) -> str:
308
+ merged_text = []
309
+ footnotes = []
310
+ for s in sources_list:
311
+ footnotes.append(f"[^{s['index']}]: {s['link']}")
312
+ text_block = (
313
+ f"Source {s['index']} Title: {s['title']}\n"
314
+ f"FootnoteRef: [^{s['index']}]\n"
315
+ f"Text:\n{s['cleaned_text']}\n"
316
+ )
317
+ merged_text.append(text_block)
318
+ all_content = "\n\n".join(merged_text)
319
+ system_prompt = f"""You are a highly skilled professional research analyst.
320
+ You have access to multiple authoritative sources on the topic: {topic}.
321
+ Your task is to produce a comprehensive and detailed formal research report that includes the following sections:
322
+
323
+ 1. **Title:** Use the topic as the title of the report.
324
+ 2. **Executive Summary:** Provide a concise overview highlighting the key findings and insights.
325
+ 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
326
+ 4. **Main Body:**
327
+ - **Sub-heading 1:** Summarize insights from Source 1.
328
+ - **Sub-heading 2:** Summarize insights from Source 2.
329
+ - *(Continue as needed for all sources)*
330
+ - **Analysis:** Provide an in-depth analysis combining information from all sources.
331
+ 5. **Conclusion:** Present final thoughts, implications, and potential future directions.
332
+ 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
333
+
334
+ **Requirements:**
335
+ - **Length:** The report must be at least **1,000 words** in total.
336
+ - **Content Quality:**
337
+ - Incorporate relevant facts, figures, and statistics.
338
+ - Use professional and clear language.
339
+ - Ensure each section is well-developed without unnecessary repetition.
340
+ - **Structure:** Maintain a logical and cohesive flow throughout the report.
341
+ - **Formatting:** Use proper formatting for headings, sub-headings, and references.
342
+
343
+ **Below is the aggregated content from your sources (with footnote references):**
344
+ -----------------------------------------------------------------------
345
+ {all_content}
346
+ -----------------------------------------------------------------------
347
+ **Footnotes:**
348
+ {chr(10).join(footnotes)}
349
+ """
350
+ def count_tokens(text: str) -> int:
351
+ tokenizer = tiktoken.get_encoding("cl100k_base")
352
+ tokens = tokenizer.encode(text)
353
+ return len(tokens)
354
+ max_tokens = 6000
355
+ system_prompt_tokens = count_tokens(system_prompt)
356
+ all_content_tokens = count_tokens(all_content)
357
+ total_tokens = system_prompt_tokens + all_content_tokens
358
+ print(f"[DEBUG] Total tokens before optimization: {total_tokens}")
359
+ if total_tokens > max_tokens:
360
+ allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100
361
+ if allowed_tokens_for_content <= 0:
362
+ print("[ERROR] System prompt alone exceeds the token limit.")
363
+ return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
364
+ tokenizer = tiktoken.get_encoding("cl100k_base")
365
+ all_content_tokens_list = tokenizer.encode(all_content)
366
+ truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
367
+ truncated_content = tokenizer.decode(truncated_tokens)
368
+ system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
369
+ print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
370
+ try:
371
+ response = call_deepseek_api(
372
+ system_prompt=system_prompt,
373
+ user_prompt="",
374
+ max_tokens=3000,
375
+ temperature=0.7
376
+ )
377
+ final_report = response.strip()
378
+ word_count = len(final_report.split())
379
+ if word_count < 1000:
380
+ print(f"[WARNING] Generated report is below desired length: {word_count} words.")
381
+ return final_report
382
+ except Exception as e:
383
+ print("[ERROR] Could not finalize professional report:", e)
384
+ return "An unexpected error occurred. Please try again later."
385
+
386
+ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
387
+ print("[LOG] Communicating with DeepSeek R1 via OpenRouter API.")
388
+ try:
389
+ headers = {
390
+ "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
391
+ "HTTP-Referer": "https://yourdomain.com", # Replace with your site URL if needed
392
+ "X-Title": "MyPod", # Replace with your site name if needed
393
+ "Content-Type": "application/json"
394
+ }
395
+ data = {
396
+ "model": "deepseek/deepseek-r1:free",
397
+ "messages": [
398
+ {"role": "system", "content": system_prompt},
399
+ {"role": "user", "content": user_prompt}
400
+ ],
401
+ "max_tokens": max_tokens,
402
+ "temperature": temperature
403
+ }
404
+ response = requests.post("https://openrouter.ai/api/v1/chat/completions",
405
+ headers=headers, data=json.dumps(data))
406
+ response.raise_for_status()
407
+ json_response = response.json()
408
+ if "choices" not in json_response:
409
+ raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
410
+ return json_response["choices"][0]["message"]["content"]
411
+ except requests.exceptions.HTTPError as e:
412
+ status_code = e.response.status_code
413
+ if status_code == 503:
414
+ print("[ERROR] Service Unavailable from Deepseek API.")
415
+ raise ValueError("Service is currently unavailable. Please try again later.")
416
+ elif status_code == 413:
417
+ print("[ERROR] Request too large for Deepseek API.")
418
+ raise ValueError("The request is too large. Please reduce the input size and try again.")
419
+ else:
420
+ print("[ERROR] Deepseek API error:", e)
421
+ raise ValueError("An error occurred while generating the report. Please try again later.")
422
+ except Exception as e:
423
+ print("[ERROR] Could not communicate with Deepseek API:", e)
424
+ raise ValueError("An unexpected error occurred. Please try again later.")
425
+
426
+ def generate_pdf_from_markdown(markdown_text: str) -> bytes:
427
+ try:
428
+ html = markdown.markdown(markdown_text, extensions=["extra", "tables", "toc"])
429
+ pdf_bytes = pdfkit.from_string(html, False)
430
+ return pdf_bytes
431
+ except Exception as e:
432
+ print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
433
+ return b""
434
+
435
+ def fetch_wikipedia_summary(topic: str) -> str:
436
+ print("[LOG] Fetching Wikipedia summary for:", topic)
437
+ try:
438
+ search_url = (
439
+ f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
440
+ "&limit=1&namespace=0&format=json"
441
+ )
442
+ resp = requests.get(search_url)
443
+ if resp.status_code != 200:
444
+ print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
445
+ return ""
446
+ data = resp.json()
447
+ if len(data) > 1 and data[1]:
448
+ title = data[1][0]
449
+ summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
450
+ s_resp = requests.get(summary_url)
451
+ if s_resp.status_code == 200:
452
+ s_data = s_resp.json()
453
+ if "extract" in s_data:
454
+ print("[LOG] Wikipedia summary fetched successfully.")
455
+ return s_data["extract"]
456
+ return ""
457
+ except Exception as e:
458
+ print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
459
+ return ""
460
+
461
+ def fetch_rss_feed(feed_url: str) -> list:
462
+ print("[LOG] Fetching RSS feed:", feed_url)
463
+ try:
464
+ resp = requests.get(feed_url)
465
+ if resp.status_code != 200:
466
+ print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
467
+ return []
468
+ soup = BeautifulSoup(resp.content, "xml")
469
+ items = soup.find_all("item")
470
+ return items
471
+ except Exception as e:
472
+ print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
473
+ return []
474
 
475
+ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
476
+ print("[LOG] Finding relevant articles...")
477
+ keywords = re.findall(r'\w+', topic.lower())
478
+ for item in items:
479
+ title = item.find("title").get_text().strip() if item.find("title") else ""
480
+ description = item.find("description").get_text().strip() if item.find("description") else ""
481
+ text = (title + " " + description).lower()
482
+ matches = sum(1 for kw in keywords if kw in text)
483
+ if matches >= min_match:
484
+ link = item.find("link").get_text().strip() if item.find("link") else ""
485
+ print(f"[LOG] Relevant article found: {title}")
486
+ return title, description, link
487
+ return None, None, None
488
 
489
+ # ------------------------------
490
+ # Preprocess text for TTS
491
+ # ------------------------------
492
+ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
493
+ text = re.sub(r"\bNo\.\b", "Number", text)
494
+ text = re.sub(r"(?i)\bSaaS\b", "sass", text)
495
+ abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
496
+ def insert_periods_for_abbrev(m):
497
+ abbr = m.group(0)
498
+ if abbr in abbreviations_as_words:
499
+ return abbr
500
+ return ".".join(list(abbr)) + "."
501
+ text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
502
+ text = re.sub(r"\.\.", ".", text)
503
+ return text
504
 
505
  def _spell_digits(d: str) -> str:
506
  digit_map = {
 
510
  }
511
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
512
 
513
+ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
514
+ if custom_music_path:
515
+ music_path = custom_music_path
516
+ else:
517
+ music_path = "bg_music.mp3"
518
+ try:
519
+ bg_music = AudioSegment.from_file(music_path, format="mp3")
520
+ except Exception as e:
521
+ print("[ERROR] Failed to load background music:", e)
522
+ return spoken
523
+ bg_music = bg_music - 18.0
524
+ total_length_ms = len(spoken) + 2000
525
+ looped_music = AudioSegment.empty()
526
+ while len(looped_music) < total_length_ms:
527
+ looped_music += bg_music
528
+ looped_music = looped_music[:total_length_ms]
529
+ final_mix = looped_music.overlay(spoken, position=2000)
530
+ return final_mix
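For quick verification of the refactored flow, a minimal usage sketch is shown below (not part of the commit). It assumes the new utils.py is importable, DEEPSEEK_API_KEY (OpenRouter) is set, and wkhtmltopdf is available for pdfkit; the topic string and output path are illustrative, and aggregator keys that are missing (GOOGLE_API_KEY/GOOGLE_CSE_ID, SERP_API_KEY, NEWS_API_KEY) generally just cause that source to log an error and contribute nothing.

```python
# Minimal sketch, not from the commit: exercise the refactored research path.
# Assumptions: utils.py from this commit is on the import path; DEEPSEEK_API_KEY
# is set for OpenRouter; wkhtmltopdf is installed so pdfkit can render the PDF.
from utils import perform_deep_research, generate_pdf_from_markdown

if __name__ == "__main__":
    topic = "renewable energy storage"                  # illustrative topic
    report_md = perform_deep_research(topic)            # aggregate sources -> Markdown report
    pdf_bytes = generate_pdf_from_markdown(report_md)   # Markdown -> PDF bytes via pdfkit

    if pdf_bytes:
        with open("report.pdf", "wb") as fh:            # illustrative output path
            fh.write(pdf_bytes)
        print("[LOG] Wrote report.pdf")
    else:
        # PDF rendering failed (e.g. wkhtmltopdf missing); fall back to the Markdown text.
        print("[ERROR] PDF generation failed; first part of the Markdown report:")
        print(report_md[:500])
```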