siddhartharyaai committed
Commit 13936d2 (verified) · Parent: 345a230

Update utils.py

Files changed (1): utils.py (+72 -24)
utils.py CHANGED
@@ -277,7 +277,6 @@ def generate_script(
     import streamlit as st
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
 
-    # pick host/guest names based on language
     if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
         host_name = "Isha"
     if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
@@ -498,15 +497,68 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
         return []
 
 ###############################################################################
-# Unified aggregator: google + bing + wiki + rss + (optionally LLM)
+# Event Registry (News API) aggregator
+###############################################################################
+def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
+    """
+    Query https://eventregistry.org/api/v1/article/getArticles
+    with the env var NEWS_API_KEY, searching for 'topic'.
+    Return a list of {title, link, snippet} dicts.
+    """
+    news_api_key = os.environ.get("NEWS_API_KEY")
+    if not news_api_key:
+        print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
+        return []
+
+    print("[LOG] Attempting Event Registry for topic:", topic)
+    endpoint = "https://eventregistry.org/api/v1/article/getArticles"
+    # One page, up to 'count' articles -- a minimal request body.
+    # See the Event Registry docs for the full parameter set.
+    body = {
+        "action": "getArticles",
+        "keyword": topic,
+        "articlesPage": 1,
+        "articlesCount": count,  # API allows up to 100; we use 10 for uniformity
+        "articlesSortBy": "date",
+        "articlesSortByAsc": False,
+        "dataType": ["news", "pr"],
+        "forceMaxDataTimeWindow": 31,  # last month
+        "resultType": "articles",
+        "apiKey": news_api_key
+    }
+
+    try:
+        resp = requests.post(endpoint, json=body, timeout=20)
+        resp.raise_for_status()
+        data = resp.json()
+        # Per the docs, articles live at data["articles"]["results"].
+        art_data = data.get("articles", {})
+        results_arr = art_data.get("results", [])
+
+        ret = []
+        for item in results_arr:
+            # Each item may carry "title", "url", "body", or "titleUri".
+            title = item.get("title", "")
+            url = item.get("url", "")
+            # Prefer the full "body"; fall back to "excerpt".
+            snippet = item.get("body", "") or item.get("excerpt", "")
+            ret.append({"title": title, "link": url, "snippet": snippet})
+        return ret
+    except Exception as e:
+        print("[ERROR] Event Registry approach failed:", e)
+        return []
+
+###############################################################################
+# Unified aggregator: google + bing + wiki + rss + event registry + fallback
 ###############################################################################
 def perform_deep_research(topic: str) -> str:
     """
-    1) Google (up to 10) if creds.
-    2) Bing (up to 10) if SERP_API_KEY.
+    1) Google (up to 10) if creds
+    2) Bing (up to 10) if SERP_API_KEY
     3) Wikipedia summary
     4) RSS approach
-    5) If still nothing, fallback to older approach (which also calls LLM).
+    5) Event Registry (news api) if NEWS_API_KEY
+    6) If still nothing, fall back to the older approach (which also calls the LLM)
     """
     # Step 1: Google
     google_cse_id = os.environ.get("GOOGLE_CSE_ID")
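
The new fetcher can be smoke-tested in isolation. A minimal sketch, assuming `utils.py` is importable, `requests` is installed, and `NEWS_API_KEY` is exported (the topic string below is only an example):

```python
# Quick standalone check of fetch_eventregistry_articles outside the aggregator.
import os
from utils import fetch_eventregistry_articles

assert os.environ.get("NEWS_API_KEY"), "export NEWS_API_KEY before running"

for art in fetch_eventregistry_articles("renewable energy", count=5):
    # Each dict carries the {title, link, snippet} keys the aggregator
    # expects downstream.
    print(f"- {art['title']}\n  {art['link']}")
```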
@@ -549,9 +601,7 @@ def perform_deep_research(topic: str) -> str:
     }
 
     # Step 4: RSS approach
-    # We'll basically do the "research_topic" approach, but let's do just the raw RSS + not LLM fallback
-    # so we don't do double LLM. We'll gather any RSS items if found
-    rss_sources = []
+    # Reuse the "research_topic" approach, but only its RSS portion (no second LLM call).
     sources_dict = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -561,6 +611,7 @@ def perform_deep_research(topic: str) -> str:
         "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
     }
+    rss_sources = []
     for name, feed_url in sources_dict.items():
         try:
             items = fetch_rss_feed(feed_url)
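
`fetch_rss_feed` itself is defined elsewhere in `utils.py` and is not part of this diff. For orientation only, a plausible minimal version built on `feedparser` could look like the sketch below; the real helper may differ:

```python
# Hypothetical stand-in for fetch_rss_feed, shown only to illustrate the shape
# of the items the loop above consumes; the actual utils.py helper may differ.
import feedparser

def fetch_rss_feed_sketch(feed_url: str) -> list:
    parsed = feedparser.parse(feed_url)  # feedparser tolerates malformed XML
    return [
        {
            "title": entry.get("title", ""),
            "link": entry.get("link", ""),
            "snippet": entry.get("summary", ""),
        }
        for entry in parsed.entries
    ]
```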
@@ -585,22 +636,21 @@ def perform_deep_research(topic: str) -> str:
             print(f"[ERROR] Error fetching from {name} RSS feed:", e)
             continue
 
+    # Step 5: Event Registry
+    event_registry_res = fetch_eventregistry_articles(topic, count=10)
+
     # Combine everything
     combined = []
-    # google
     combined.extend(google_sources)
-    # bing
     combined.extend(bing_results)
-    # wiki
     if wiki_item:
         combined.append(wiki_item)
-    # rss
     combined.extend(rss_sources)
+    combined.extend(event_registry_res)
 
     if not combined:
-        print("[LOG] No results found from aggregator. Falling back to older approach w/ LLM.")
-        # The older approach also might call LLM if needed
-        old_fallback = research_topic(topic)  # calls wiki + RSS + LLM fallback
+        print("[LOG] No results found from aggregator. Falling back to older method w/ LLM.")
+        old_fallback = research_topic(topic)  # wiki + RSS + LLM fallback
         cleaned_fb = rewrite_in_professional_style(topic, old_fallback)
         fallback_data = [{
             "index": 1,
@@ -618,15 +668,7 @@ def perform_deep_research(topic: str) -> str:
         link = source.get("link", "")
         snippet = source.get("snippet", "")
         title = source.get("title", "")
-        # attempt to fetch link text if snippet is empty (but we already have snippet from google/bing)
-        # We'll consider snippet enough, or we can do fetch_article_text if you want deeper text:
-        # but let's keep snippet for performance
-        # If you prefer to fetch the link, uncomment:
-        """
-        if link and not snippet.strip():
-            link_text = fetch_article_text(link)
-            snippet = link_text or snippet
-        """
+
         cleaned_text = rewrite_in_professional_style(topic, snippet)
         if cleaned_text.strip():
             final_list.append({
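
The removed block kept an optional deep-fetch path around as a comment: pull the full article text only when a search snippet comes back empty. If that behaviour is ever wanted again, a compact form of the same idea, assuming the `fetch_article_text` helper the old comment referenced still exists in `utils.py`:

```python
from utils import fetch_article_text  # helper the removed comment referred to

def snippet_or_article(link: str, snippet: str) -> str:
    # Hit the network only when the search snippet is empty -- the same
    # depth-vs-performance trade-off the removed comment described.
    if link and not snippet.strip():
        return fetch_article_text(link) or snippet
    return snippet
```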
@@ -731,6 +773,12 @@ def generate_simple_chart(data_list: list) -> str:
     return "*(Chart could not be generated in PNG form.)*"
 
 def generate_pdf_from_markdown(md_content: str) -> bytes:
+    """
+    Convert Markdown to PDF using pdfkit (requires the `markdown` library).
+    If you see "ModuleNotFoundError: No module named 'markdown'", install it:
+        pip install markdown
+    Also ensure wkhtmltopdf is installed on the system for pdfkit to work.
+    """
     import markdown
     html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
 
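
The new docstring spells out the two external requirements. A minimal sketch of the Markdown-to-PDF pipeline it describes, assuming `pip install markdown pdfkit` plus a system `wkhtmltopdf` binary on PATH:

```python
import markdown
import pdfkit

def markdown_to_pdf_bytes(md_content: str) -> bytes:
    # Markdown -> HTML with the same extensions utils.py enables.
    html = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
    # output_path=False tells pdfkit to return the rendered PDF as bytes
    # instead of writing it to a file.
    return pdfkit.from_string(html, False)
```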