Create utils.py

utils.py
import os
import re
import json
import requests
import tempfile
from bs4 import BeautifulSoup
from typing import List, Literal
from pydantic import BaseModel
from pydub import AudioSegment, effects
from transformers import pipeline
import yt_dlp
import tiktoken
from groq import Groq  # Retained for other functions if needed
import numpy as np
import torch
import random

import base64
from io import BytesIO
import altair as alt
import pdfkit
import altair_saver  # For PNG export with Altair

###############################################################################
# Pydantic Models
###############################################################################
class DialogueItem(BaseModel):
    speaker: Literal["Jane", "John"]
    display_speaker: str = "Jane"
    text: str

class Dialogue(BaseModel):
    dialogue: List[DialogueItem]

###############################################################################
# ASR Pipeline (Whisper tiny)
###############################################################################
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=0 if torch.cuda.is_available() else -1
)

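# Rough usage sketch (the pipeline is not called elsewhere in this module): a
# Hugging Face ASR pipeline accepts an audio file path and returns a dict with
# the transcription under "text"; "clip.wav" is a hypothetical local file.
#   result = asr_pipeline("clip.wav")
#   transcript = result["text"]
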
###############################################################################
# Helper: Truncate text if it exceeds token limit
###############################################################################
def truncate_text(text, max_tokens=2048):
    print("[LOG] Truncating text if needed.")
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        print("[LOG] Text too long, truncating.")
        return tokenizer.decode(tokens[:max_tokens])
    return text

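# Example sketch: cap a long article at ~1k tokens before sending it to an LLM;
# `article` is a hypothetical string loaded elsewhere:
#   short_article = truncate_text(article, max_tokens=1024)
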
###############################################################################
# Extract text from a URL
###############################################################################
def extract_text_from_url(url):
    print("[LOG] Extracting text from URL:", url)
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/115.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ')
        print("[LOG] Text extraction from URL successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Exception during text extraction from URL: {e}")
        return ""

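# Example sketch (needs outbound network access from the Space):
#   page_text = extract_text_from_url("https://en.wikipedia.org/wiki/Podcast")
#   if page_text:
#       page_text = truncate_text(page_text)
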
###############################################################################
# Optional pitch-shift (unused)
###############################################################################
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
    shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
    return shifted_audio.set_frame_rate(audio.frame_rate)

###############################################################################
# Check if text is sufficient
###############################################################################
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    word_count = len(text.split())
    print(f"[DEBUG] Aggregated word count: {word_count}")
    return word_count >= min_word_count

###############################################################################
# LLM fallback if insufficient data
###############################################################################
def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
    print("[LOG] Querying LLM for additional info.")
    system_prompt = (
        "You are an AI assistant with extensive knowledge up to 2023-10. "
        "Provide additional relevant information on the following topic based on your knowledge base.\n\n"
        f"Topic: {topic}\n\n"
        f"Existing Information: {existing_text}\n\n"
        "Please add more insightful details, facts, and perspectives."
    )
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
    except Exception as e:
        print("[ERROR] Groq API error during fallback:", e)
        return ""
    info = response.choices[0].message.content.strip()
    print("[DEBUG] Additional info from LLM:")
    print(info)
    return info

###############################################################################
# Rewrite text in professional style
###############################################################################
def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
    if not raw_text.strip():
        return ""

    system_prompt = (
        "You are a professional writing assistant. Rewrite the provided text:\n"
        "1) Use clear, fluent, professional English.\n"
        "2) Keep it on-topic about {topic}, removing disclaimers or non-English filler.\n"
        "3) Summarize if too long, but keep important data/facts.\n"
        "4) Organize in paragraphs/bullet points.\n"
        "5) Avoid referencing any rewriting.\n"
    ).format(topic=topic)

    user_prompt = f"Please rewrite this text:\n\n{raw_text}"
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting text via LLM:", e)
        return raw_text

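# Example sketch (assumes GROQ_API_KEY is set; returns the raw text unchanged on
# API errors); `raw_scraped_text` is a hypothetical string:
#   cleaned = rewrite_in_professional_style("electric vehicles", raw_scraped_text)
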
###############################################################################
# Legacy research: RSS + Wikipedia
###############################################################################
def research_topic(topic: str) -> str:
    sources = {
        "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition.rss",
        "Associated Press": "https://apnews.com/apf-topnews",
        "NDTV": "https://www.ndtv.com/rss/top-stories",
        "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
        "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
        "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
        "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
    }

    summary_parts = []
    wiki_summary = fetch_wikipedia_summary(topic)
    if wiki_summary:
        summary_parts.append(f"From Wikipedia: {wiki_summary}")

    for name, feed_url in sources.items():
        try:
            items = fetch_rss_feed(feed_url)
            if not items:
                continue
            title, desc, link = find_relevant_article(items, topic, min_match=2)
            if link:
                article_text = fetch_article_text(link)
                if article_text:
                    summary_parts.append(f"From {name}: {article_text}")
                else:
                    summary_parts.append(f"From {name}: {title} - {desc}")
        except Exception as e:
            print(f"[ERROR] Error fetching from {name} RSS feed:", e)
            continue

    aggregated_info = " ".join(summary_parts)
    print("[DEBUG] Aggregated info from primary sources:")
    print(aggregated_info)

    if not is_sufficient(aggregated_info):
        print("[LOG] Not enough info. LLM fallback.")
        extra_info = query_llm_for_additional_info(topic, aggregated_info)
        if extra_info:
            aggregated_info += " " + extra_info
        else:
            print("[ERROR] LLM fallback gave nothing.")
    if not aggregated_info:
        return f"Sorry, no info on '{topic}'."
    return aggregated_info

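# Example sketch: aggregate Wikipedia plus the RSS feeds above, with the LLM
# fallback kicking in when fewer than ~500 words are collected:
#   background = research_topic("heatwave preparedness in Indian cities")
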
def fetch_wikipedia_summary(topic: str) -> str:
    print("[LOG] Fetching Wikipedia summary for:", topic)
    try:
        search_url = (
            f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
            "&limit=1&namespace=0&format=json"
        )
        resp = requests.get(search_url)
        if resp.status_code != 200:
            print(f"[ERROR] Wikipedia fetch fail for {topic}")
            return ""
        data = resp.json()
        if len(data) > 1 and data[1]:
            title = data[1][0]
            summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
            s_resp = requests.get(summary_url)
            if s_resp.status_code == 200:
                s_data = s_resp.json()
                if "extract" in s_data:
                    print("[LOG] Wikipedia summary found.")
                    return s_data["extract"]
        return ""
    except Exception as e:
        print(f"[ERROR] Wikipedia summary error: {e}")
        return ""

def fetch_rss_feed(feed_url: str) -> list:
    print("[LOG] RSS feed:", feed_url)
    try:
        resp = requests.get(feed_url)
        if resp.status_code != 200:
            print(f"[ERROR] RSS feed fail: {feed_url}")
            return []
        soup = BeautifulSoup(resp.content, "xml")
        return soup.find_all("item")
    except Exception as e:
        print(f"[ERROR] RSS error: {e}")
        return []

def find_relevant_article(items, topic: str, min_match=2) -> tuple:
    print("[LOG] Searching relevant article...")
    keywords = re.findall(r'\w+', topic.lower())
    for item in items:
        title = item.find("title").get_text().strip() if item.find("title") else ""
        description = item.find("description").get_text().strip() if item.find("description") else ""
        text = (title + " " + description).lower()
        matches = sum(1 for kw in keywords if kw in text)
        if matches >= min_match:
            link = item.find("link").get_text().strip() if item.find("link") else ""
            print(f"[LOG] Relevant article found: {title}")
            return title, description, link
    return None, None, None

def fetch_article_text(link: str) -> str:
    print("[LOG] Fetching article text from:", link)
    if not link:
        print("[LOG] No link.")
        return ""
    try:
        r = requests.get(link)
        if r.status_code != 200:
            print(f"[ERROR] Article fetch fail: {link}")
            return ""
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs[:5])
        print("[LOG] Article text fetched.")
        return text.strip()
    except Exception as e:
        print(f"[ERROR] fetch_article_text error: {e}")
        return ""

###############################################################################
# Script generation for podcasts
###############################################################################
def generate_script(
    system_prompt: str,
    input_text: str,
    tone: str,
    target_length: str,
    host_name: str = "Jane",
    guest_name: str = "John",
    sponsor_style: str = "Separate Break",
    sponsor_provided=None
):
    import streamlit as st
    print("[LOG] Generating script. Tone:", tone, "Length:", target_length)

    if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        host_name = "Isha"
    if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        guest_name = "Aarav"

    words_per_minute = 150
    numeric_minutes = 3
    match = re.search(r"(\d+)", target_length)
    if match:
        numeric_minutes = int(match.group(1))

    min_words = max(50, numeric_minutes * 100)
    max_words = numeric_minutes * words_per_minute

    tone_map = {
        "Humorous": "funny and exciting",
        "Formal": "business-like, well-structured, professional",
        "Casual": "like a conversation between close friends",
        "Youthful": "energetic and lively"
    }
    chosen_tone = tone_map.get(tone, "casual")

    if sponsor_provided:
        if sponsor_style == "Separate Break":
            sponsor_instructions = "If sponsor content is provided, place in a separate ad break (~30s)."
        else:
            sponsor_instructions = "If sponsor content is provided, blend (~30s) into the conversation."
    else:
        sponsor_instructions = ""

    prompt = (
        f"{system_prompt}\n"
        f"TONE: {chosen_tone}\n"
        f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
        f"INPUT TEXT: {input_text}\n\n"
        f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
        "Output must be JSON:\n"
        "{\n"
        '  "dialogue": [\n'
        '    {"speaker": "Jane", "text": "..."},\n'
        '    {"speaker": "John", "text": "..."}\n'
        "  ]\n"
        "}"
    )
    print("[LOG] Prompt to LLM:", prompt)

    if st.session_state.get("language_selection") == "Hinglish":
        prompt += "\n\nPlease generate the script in Romanized Hindi.\n"
    elif st.session_state.get("language_selection") == "Hindi":
        prompt += "\n\nPlease generate the script exclusively in Hindi.\n"

    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, data=json.dumps(data))
        r.raise_for_status()
        raw_content = r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] LLM error:", e)
        raise ValueError(f"Error with LLM call: {e}")

    start_index = raw_content.find('{')
    end_index = raw_content.rfind('}')
    if start_index == -1 or end_index == -1:
        raise ValueError("No JSON found in LLM response.")

    json_str = raw_content[start_index:end_index+1].strip()
    try:
        data_js = json.loads(json_str)
        diag_list = data_js.get("dialogue", [])
        for d in diag_list:
            raw_speaker = d.get("speaker", "Jane")
            if raw_speaker.lower() == host_name.lower():
                d["speaker"] = "Jane"
                d["display_speaker"] = host_name
            elif raw_speaker.lower() == guest_name.lower():
                d["speaker"] = "John"
                d["display_speaker"] = guest_name
            else:
                d["speaker"] = "Jane"
                d["display_speaker"] = raw_speaker

        final_items = []
        for d in diag_list:
            if "display_speaker" not in d:
                d["display_speaker"] = d["speaker"]
            final_items.append(DialogueItem(**d))
        return Dialogue(dialogue=final_items)
    except Exception as e:
        print("[ERROR] JSON parse error:", e)
        raise ValueError(f"Failed to parse JSON from LLM: {e}")

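# Example sketch (assumes an OpenRouter DEEPSEEK_API_KEY and a running Streamlit
# session for the language selection; names and text are illustrative):
#   script = generate_script(
#       system_prompt="You write lively two-host podcast scripts.",
#       input_text=research_text,          # hypothetical string from research_topic()
#       tone="Casual",
#       target_length="5 Mins",
#       host_name="Priya",
#       guest_name="Rahul",
#   )
#   for item in script.dialogue:
#       print(item.display_speaker, ":", item.text)
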
###############################################################################
# Transcribe YouTube (RapidAPI)
###############################################################################
def transcribe_youtube_video(video_url: str) -> str:
    print("[LOG] Transcribing YouTube via RapidAPI:", video_url)
    vid_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
    if not vid_match:
        raise ValueError("Invalid YouTube URL, cannot find video ID.")
    video_id = vid_match.group(1)

    base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
    params = {"video_id": video_id, "lang": "en"}
    headers = {
        "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
    }
    try:
        r = requests.get(base_url, headers=headers, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, list) or not data:
            raise ValueError("No transcript data returned.")
        text = data[0].get('transcriptionAsText', '').strip()
        if not text:
            raise ValueError("Transcript is empty.")
        return text
    except Exception as e:
        print("[ERROR] RapidAPI transcription error:", e)
        raise ValueError(f"Error transcribing YouTube: {e}")

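# Example sketch (assumes RAPIDAPI_KEY is set; raises ValueError on any failure):
#   transcript = transcribe_youtube_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
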
###############################################################################
# TTS => mp3 file path
###############################################################################
def generate_audio_mp3(text: str, speaker: str) -> str:
    import streamlit as st
    print(f"[LOG] Generating audio for speaker: {speaker}")
    language = st.session_state.get("language_selection", "English (American)")

    if language == "English (American)":
        # DEEPGRAM approach
        ...
    else:
        # MURF approach
        ...
    return "...some_mp3_file_path..."

def transcribe_youtube_video_OLD_YTDLP(video_url: str):
    pass

def _preprocess_text_for_tts(text: str, speaker: str) -> str:
    return text

def _spell_digits(d: str) -> str:
    return ""

###############################################################################
# Mix with BG music
###############################################################################
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
    if custom_music_path:
        music_path = custom_music_path
    else:
        music_path = "bg_music.mp3"
    try:
        bg_music = AudioSegment.from_file(music_path, format="mp3")
    except Exception as e:
        print("[ERROR] Could not load bg music:", e)
        return spoken
    bg_music = bg_music - 18.0
    total_len = len(spoken) + 2000
    looped = AudioSegment.empty()
    while len(looped) < total_len:
        looped += bg_music
    looped = looped[:total_len]
    final_mix = looped.overlay(spoken, position=2000)
    return final_mix

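# Example sketch: duck the bundled bg_music.mp3 under finished narration with a
# 2-second lead-in; "episode_voice.mp3" is a hypothetical file:
#   spoken = AudioSegment.from_file("episode_voice.mp3", format="mp3")
#   mix_with_bg_music(spoken).export("episode_final.mp3", format="mp3")
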
###############################################################################
# Q&A
###############################################################################
def call_groq_api_for_qa(system_prompt: str) -> str:
    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": system_prompt}],
            "max_tokens": 512,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions",
                          headers=headers, data=json.dumps(data))
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] QA call failed:", e)
        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering now."}
        return json.dumps(fallback)

###############################################################################
# Bing multi-search
###############################################################################
def fetch_bing_results(query: str, count: int = 12) -> list:
    """
    Query the Bing Web Search API. The key is stripped first, since secrets
    pasted into the Space can carry a trailing newline.
    """
    bing_api_key = os.environ.get("BING_API_KEY")
    if bing_api_key:
        bing_api_key = bing_api_key.strip()  # remove trailing newline if any
    else:
        return []

    print("[LOG] Attempting Bing Web Search for:", query)
    url = "https://api.bing.microsoft.com/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": bing_api_key}
    params = {"q": query, "count": count}
    try:
        resp = requests.get(url, headers=headers, params=params, timeout=15)
        if resp.status_code != 200:
            print("[ERROR] Bing search code:", resp.status_code)
            print("[DEBUG] Bing search body:", resp.text)
        resp.raise_for_status()
        data = resp.json()
        web_pages = data.get("webPages", {}).get("value", [])
        results = []
        for wp in web_pages:
            results.append({
                "title": wp.get("name", ""),
                "link": wp.get("url", ""),
                "snippet": wp.get("snippet", "")
            })
        return results
    except Exception as e:
        print("[ERROR] Bing search failed:", e)
        return []

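# Example sketch (assumes BING_API_KEY is set; returns [] when the key is
# missing or the request fails):
#   hits = fetch_bing_results("small modular reactors", count=5)
#   for h in hits:
#       print(h["title"], "->", h["link"])
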
###############################################################################
# Combine all cleaned sources
###############################################################################
def _gather_cleaned_sources(topic: str, sources_list: list) -> str:
    combined_body = []
    for s in sources_list:
        snippet = (
            f"**Title**: {s['title']}\n"
            f"**Link**: {s['link']}\n\n"
            f"{s['cleaned_text']}\n\n"
        )
        combined_body.append(snippet)
    return "\n".join(combined_body)

###############################################################################
# Reorganize final text into multi-section "Professional Research Report"
# with dynamic headings
###############################################################################
def rewrite_into_pro_outline(topic: str, combined_body: str) -> str:
    prompt = f"""
You are a professional research writer. Please produce a final research report with this structure:

# Professional Research Report
## {{Topic}}

### Executive Summary
(Write a concise summary of the key insights.)

Then automatically determine 3-5 relevant section headings for this content, giving each a descriptive title.
Follow with a 'Conclusion' section.
Finally add 'References & Footnotes', then '(End of Professional Report)'.

Adapt headings to the actual content. Avoid headings that don't make sense for the user topic.

Topic: {topic}

Raw Combined Text:
{combined_body}
"""
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        resp = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a professional research writer."},
                {"role": "user", "content": prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=4096,
            temperature=0.7
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting into pro outline failed:", e)
        return combined_body

###############################################################################
# The main function that queries Google & Bing
###############################################################################
def perform_deep_research(topic: str) -> str:
    # HF SPACES UPGRADE: remove trailing newlines from env secrets
    google_cse_id = os.environ.get("GOOGLE_CSE_ID")
    google_api_key = os.environ.get("GOOGLE_API_KEY")
    if google_cse_id:
        google_cse_id = google_cse_id.strip()
    if google_api_key:
        google_api_key = google_api_key.strip()

    all_sources = []

    # GOOGLE
    google_results = []
    if google_cse_id and google_api_key:
        try:
            print("[LOG] Attempting Google CSE for:", topic)
            url = "https://customsearch.googleapis.com/customsearch/v1"
            params = {
                "q": topic,
                "cx": google_cse_id,
                "key": google_api_key,
                "num": 12
            }
            # We'll do an explicit check/log
            resp = requests.get(url, params=params, timeout=15)
            if resp.status_code != 200:
                print("[ERROR] Google CSE status code:", resp.status_code)
                print("[DEBUG] Google CSE response body:", resp.text)
            resp.raise_for_status()
            data = resp.json()
            items = data.get("items", [])
            for it in items:
                google_results.append({
                    "title": it.get("title", ""),
                    "link": it.get("link", ""),
                    "snippet": it.get("snippet", "")
                })
        except requests.HTTPError as e:
            print("[ERROR] Google approach failed (HTTPError):", e)
        except Exception as e:
            print("[ERROR] Google approach failed (other error):", e)

    # BING
    bing_results = fetch_bing_results(topic, count=12)
    combined_raw = google_results + bing_results

    if not combined_raw:
        print("[LOG] No direct search results, fallback to older approach.")
        fallback_info = research_topic(topic)
        cleaned_fb = rewrite_in_professional_style(topic, fallback_info)
        all_sources = [{
            "index": 1,
            "title": "Fallback Info",
            "link": "N/A",
            "cleaned_text": cleaned_fb
        }]
    else:
        idx = 0
        for res in combined_raw:
            idx += 1
            link = res["link"]
            snippet = res["snippet"] or ""
            title = res["title"] or ""
            article_text = fetch_article_text(link)
            if not article_text.strip():
                article_text = snippet
            cleaned = rewrite_in_professional_style(topic, article_text)
            if cleaned.strip():
                item = {
                    "index": idx,
                    "title": title,
                    "link": link,
                    "cleaned_text": cleaned
                }
                all_sources.append(item)

    if not all_sources:
        print("[LOG] None found after rewriting, fallback anyway.")
        fb_info = research_topic(topic)
        cleaned_fb = rewrite_in_professional_style(topic, fb_info)
        all_sources = [{
            "index": 1,
            "title": "Fallback Info",
            "link": "N/A",
            "cleaned_text": cleaned_fb
        }]

    combined_body = _gather_cleaned_sources(topic, all_sources)
    final_report = rewrite_into_pro_outline(topic, combined_body)
    return final_report

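# Example sketch of the end-to-end research flow (Google CSE + Bing, with the
# legacy RSS/Wikipedia path as fallback); assumes the API keys above are set:
#   report_md = perform_deep_research("green hydrogen adoption in India")
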
###############################################################################
# Chart Generation -> base64 PNG
###############################################################################
def generate_simple_chart(data_list: list) -> str:
    if not data_list:
        return ""
    import pandas as pd
    df = pd.DataFrame({"value": data_list, "index": range(len(data_list))})
    chart = alt.Chart(df).mark_bar().encode(
        x="index:O",
        y="value:Q"
    ).properties(title="Sample Chart")
    try:
        png_bytes = altair_saver.save(chart, fp=None, fmt="png")
        b64_img = base64.b64encode(png_bytes).decode("utf-8")
        # Embed the PNG inline as a markdown image (data URI).
        return f"![Chart](data:image/png;base64,{b64_img})"
    except Exception as e:
        print("[ERROR] Chart export error:", e)
        return "*(Chart could not be generated in PNG form.)*"

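# Example sketch: render a small list of numbers as a bar chart and get back a
# markdown image string ("" for empty input, an apology note if export fails):
#   chart_md = generate_simple_chart([3, 7, 2, 9])
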
###############################################################################
# Markdown -> PDF
###############################################################################
def generate_pdf_from_markdown(md_content: str) -> bytes:
    import markdown
    html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_html:
        tmp_html.write(html_content.encode("utf-8"))
        tmp_html_path = tmp_html.name

    tmp_pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    try:
        pdfkit.from_file(tmp_html_path, tmp_pdf_path)
        with open(tmp_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        if os.path.exists(tmp_html_path):
            os.remove(tmp_html_path)
        if os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)

    return pdf_bytes
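
# Example sketch (pdfkit shells out to wkhtmltopdf, which must be installed in
# the Space image); `report_md` is a hypothetical markdown string:
#   pdf_bytes = generate_pdf_from_markdown(report_md)
#   with open("research_report.pdf", "wb") as f:
#       f.write(pdf_bytes)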