siddhartharyaai committed
Commit 13936d2 (verified) · Parent: 345a230

Update utils.py

Files changed (1): utils.py (+72 -24)
utils.py CHANGED
@@ -277,7 +277,6 @@ def generate_script(
     import streamlit as st
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
 
-    # pick host/guest names based on language
     if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
         host_name = "Isha"
     if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
@@ -498,15 +497,68 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
         return []
 
 ###############################################################################
-# Unified aggregator: google + bing + wiki + rss + (optionally LLM)
+# Event Registry (News API) aggregator
+###############################################################################
+def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
+    """
+    Query https://eventregistry.org/api/v1/article/getArticles
+    with the env var NEWS_API_KEY, searching for 'topic'.
+    Return a list of {title, link, snippet} dicts.
+    """
+    news_api_key = os.environ.get("NEWS_API_KEY")
+    if not news_api_key:
+        print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
+        return []
+
+    print("[LOG] Attempting Event Registry for topic:", topic)
+    endpoint = "https://eventregistry.org/api/v1/article/getArticles"
+    # One page, up to 'count' articles -- a minimal request body.
+    # See the Event Registry docs for the full parameter set.
+    body = {
+        "action": "getArticles",
+        "keyword": topic,
+        "articlesPage": 1,
+        "articlesCount": count,  # API allows up to 100; we use 10 for uniformity
+        "articlesSortBy": "date",
+        "articlesSortByAsc": False,
+        "dataType": ["news", "pr"],
+        "forceMaxDataTimeWindow": 31,  # last month
+        "resultType": "articles",
+        "apiKey": news_api_key
+    }
+
+    try:
+        resp = requests.post(endpoint, json=body, timeout=20)
+        resp.raise_for_status()
+        data = resp.json()
+        # Per the docs, articles live at data["articles"]["results"].
+        art_data = data.get("articles", {})
+        results_arr = art_data.get("results", [])
+
+        ret = []
+        for item in results_arr:
+            # Each item may carry "title", "url", "body", or "titleUri".
+            title = item.get("title", "")
+            url = item.get("url", "")
+            # Prefer the full "body"; fall back to "excerpt".
+            snippet = item.get("body", "") or item.get("excerpt", "")
+            ret.append({"title": title, "link": url, "snippet": snippet})
+        return ret
+    except Exception as e:
+        print("[ERROR] Event Registry approach failed:", e)
+        return []
+
+###############################################################################
+# Unified aggregator: google + bing + wiki + rss + event registry + fallback
 ###############################################################################
 def perform_deep_research(topic: str) -> str:
     """
-    1) Google (up to 10) if creds.
-    2) Bing (up to 10) if SERP_API_KEY.
+    1) Google (up to 10) if creds
+    2) Bing (up to 10) if SERP_API_KEY
     3) Wikipedia summary
     4) RSS approach
-    5) If still nothing, fallback to older approach (which also calls LLM).
+    5) Event Registry (news api) if NEWS_API_KEY
+    6) If still nothing, fall back to the older approach (which also calls the LLM)
     """
     # Step 1: Google
     google_cse_id = os.environ.get("GOOGLE_CSE_ID")
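
The new fetcher can be smoke-tested in isolation. A minimal sketch, assuming `utils.py` is importable, `requests` is installed, and `NEWS_API_KEY` is exported (the topic string below is only an example):

```python
# Quick standalone check of fetch_eventregistry_articles outside the aggregator.
import os
from utils import fetch_eventregistry_articles

assert os.environ.get("NEWS_API_KEY"), "export NEWS_API_KEY before running"

for art in fetch_eventregistry_articles("renewable energy", count=5):
    # Each dict carries the {title, link, snippet} keys the aggregator
    # expects downstream.
    print(f"- {art['title']}\n  {art['link']}")
```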
@@ -549,9 +601,7 @@ def perform_deep_research(topic: str) -> str:
     }
 
     # Step 4: RSS approach
-    # We'll basically do the "research_topic" approach, but let's do just the raw RSS + not LLM fallback
-    # so we don't do double LLM. We'll gather any RSS items if found
-    rss_sources = []
+    # Reuse the "research_topic" approach, but only its RSS portion (no second LLM call).
     sources_dict = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -561,6 +611,7 @@ def perform_deep_research(topic: str) -> str:
         "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
     }
+    rss_sources = []
     for name, feed_url in sources_dict.items():
         try:
             items = fetch_rss_feed(feed_url)
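
`fetch_rss_feed` itself is defined elsewhere in `utils.py` and is not part of this diff. For orientation only, a plausible minimal version built on `feedparser` could look like the sketch below; the real helper may differ:

```python
# Hypothetical stand-in for fetch_rss_feed, shown only to illustrate the shape
# of the items the loop above consumes; the actual utils.py helper may differ.
import feedparser

def fetch_rss_feed_sketch(feed_url: str) -> list:
    parsed = feedparser.parse(feed_url)  # feedparser tolerates malformed XML
    return [
        {
            "title": entry.get("title", ""),
            "link": entry.get("link", ""),
            "snippet": entry.get("summary", ""),
        }
        for entry in parsed.entries
    ]
```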
@@ -585,22 +636,21 @@ def perform_deep_research(topic: str) -> str:
             print(f"[ERROR] Error fetching from {name} RSS feed:", e)
             continue
 
+    # Step 5: Event Registry
+    event_registry_res = fetch_eventregistry_articles(topic, count=10)
+
     # Combine everything
     combined = []
-    # google
     combined.extend(google_sources)
-    # bing
     combined.extend(bing_results)
-    # wiki
     if wiki_item:
         combined.append(wiki_item)
-    # rss
     combined.extend(rss_sources)
+    combined.extend(event_registry_res)
 
     if not combined:
-        print("[LOG] No results found from aggregator. Falling back to older approach w/ LLM.")
-        # The older approach also might call LLM if needed
-        old_fallback = research_topic(topic)  # calls wiki + RSS + LLM fallback
+        print("[LOG] No results found from aggregator. Falling back to older method w/ LLM.")
+        old_fallback = research_topic(topic)  # wiki + RSS + LLM fallback
         cleaned_fb = rewrite_in_professional_style(topic, old_fallback)
         fallback_data = [{
             "index": 1,
@@ -618,15 +668,7 @@ def perform_deep_research(topic: str) -> str:
         link = source.get("link", "")
         snippet = source.get("snippet", "")
         title = source.get("title", "")
-        # attempt to fetch link text if snippet is empty (but we already have snippet from google/bing)
-        # We'll consider snippet enough, or we can do fetch_article_text if you want deeper text:
-        # but let's keep snippet for performance
-        # If you prefer to fetch the link, uncomment:
-        """
-        if link and not snippet.strip():
-            link_text = fetch_article_text(link)
-            snippet = link_text or snippet
-        """
+
         cleaned_text = rewrite_in_professional_style(topic, snippet)
         if cleaned_text.strip():
             final_list.append({
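
The removed block kept an optional deep-fetch path around as a comment: pull the full article text only when a search snippet comes back empty. If that behaviour is ever wanted again, a compact form of the same idea, assuming the `fetch_article_text` helper the old comment referenced still exists in `utils.py`:

```python
from utils import fetch_article_text  # helper the removed comment referred to

def snippet_or_article(link: str, snippet: str) -> str:
    # Hit the network only when the search snippet is empty -- the same
    # depth-vs-performance trade-off the removed comment described.
    if link and not snippet.strip():
        return fetch_article_text(link) or snippet
    return snippet
```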
@@ -731,6 +773,12 @@ def generate_simple_chart(data_list: list) -> str:
     return "*(Chart could not be generated in PNG form.)*"
 
 def generate_pdf_from_markdown(md_content: str) -> bytes:
+    """
+    Convert Markdown to PDF using pdfkit (requires the `markdown` library).
+    If you see "ModuleNotFoundError: No module named 'markdown'", install it:
+        pip install markdown
+    Also ensure wkhtmltopdf is installed on the system for pdfkit to work.
+    """
     import markdown
     html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
 
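
The new docstring spells out the two external requirements. A minimal sketch of the Markdown-to-PDF pipeline it describes, assuming `pip install markdown pdfkit` plus a system `wkhtmltopdf` binary on PATH:

```python
import markdown
import pdfkit

def markdown_to_pdf_bytes(md_content: str) -> bytes:
    # Markdown -> HTML with the same extensions utils.py enables.
    html = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
    # output_path=False tells pdfkit to return the rendered PDF as bytes
    # instead of writing it to a file.
    return pdfkit.from_string(html, False)
```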