Update utils.py
utils.py
CHANGED
@@ -277,7 +277,6 @@ def generate_script(
     import streamlit as st
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
 
-    # pick host/guest names based on language
     if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
         host_name = "Isha"
     if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
@@ -498,15 +497,68 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
         return []
 
 ###############################################################################
-#
+# Event Registry (News API) aggregator
+###############################################################################
+def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
+    """
+    Query https://eventregistry.org/api/v1/article/getArticles
+    with the env var NEWS_API_KEY, searching for 'topic'.
+    Return list of {title, link, snippet}.
+    """
+    news_api_key = os.environ.get("NEWS_API_KEY")
+    if not news_api_key:
+        print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
+        return []
+
+    print("[LOG] Attempting Event Registry for topic:", topic)
+    endpoint = "https://eventregistry.org/api/v1/article/getArticles"
+    # We'll do 1 page, up to 'count' articles
+    # This is a minimal example request body
+    body = {
+        "action": "getArticles",
+        "keyword": topic,
+        "articlesPage": 1,
+        "articlesCount": count,  # up to 100, we do count=10 for uniformity
+        "articlesSortBy": "date",
+        "articlesSortByAsc": False,
+        "dataType": ["news", "pr"],
+        "forceMaxDataTimeWindow": 31,  # last month
+        "resultType": "articles",
+        "apiKey": news_api_key
+    }
+
+    try:
+        resp = requests.post(endpoint, json=body, timeout=20)
+        resp.raise_for_status()
+        data = resp.json()
+        # According to docs, articles can be found at data["articles"]["results"]
+        art_data = data.get("articles", {})
+        results_arr = art_data.get("results", [])
+
+        ret = []
+        for item in results_arr:
+            # item might have "title", "url", "body" or "titleUri"
+            title = item.get("title", "")
+            url = item.get("url", "")
+            # we can pick either "body" or "excerpt"
+            snippet = item.get("body", "") or item.get("excerpt", "")
+            ret.append({"title": title, "link": url, "snippet": snippet})
+        return ret
+    except Exception as e:
+        print("[ERROR] Event Registry approach failed:", e)
+        return []
+
+###############################################################################
+# Unified aggregator: google + bing + wiki + rss + event registry + fallback
 ###############################################################################
 def perform_deep_research(topic: str) -> str:
     """
-    1) Google (up to 10) if creds
-    2) Bing (up to 10) if SERP_API_KEY
+    1) Google (up to 10) if creds
+    2) Bing (up to 10) if SERP_API_KEY
     3) Wikipedia summary
     4) RSS approach
-    5)
+    5) Event Registry (news api) if NEWS_API_KEY
+    6) If still nothing, fallback to older approach (which also calls LLM)
     """
     # Step 1: Google
     google_cse_id = os.environ.get("GOOGLE_CSE_ID")
@@ -549,9 +601,7 @@ def perform_deep_research(topic: str) -> str:
     }
 
     # Step 4: RSS approach
-    # We'll basically do the "research_topic" approach
-    # so we don't do double LLM. We'll gather any RSS items if found
-    rss_sources = []
+    # We'll basically do the "research_topic" approach but only the RSS portion
     sources_dict = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -561,6 +611,7 @@ def perform_deep_research(topic: str) -> str:
         "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
     }
+    rss_sources = []
     for name, feed_url in sources_dict.items():
         try:
             items = fetch_rss_feed(feed_url)
@@ -585,22 +636,21 @@ def perform_deep_research(topic: str) -> str:
             print(f"[ERROR] Error fetching from {name} RSS feed:", e)
             continue
 
+    # Step 5: Event Registry
+    event_registry_res = fetch_eventregistry_articles(topic, count=10)
+
     # Combine everything
     combined = []
-    # google
     combined.extend(google_sources)
-    # bing
     combined.extend(bing_results)
-    # wiki
     if wiki_item:
         combined.append(wiki_item)
-    # rss
     combined.extend(rss_sources)
+    combined.extend(event_registry_res)
 
     if not combined:
-        print("[LOG] No results found from aggregator. Falling back to older
-
-        old_fallback = research_topic(topic)  # calls wiki + RSS + LLM fallback
+        print("[LOG] No results found from aggregator. Falling back to older method w/ LLM.")
+        old_fallback = research_topic(topic)
         cleaned_fb = rewrite_in_professional_style(topic, old_fallback)
         fallback_data = [{
             "index": 1,
@@ -618,15 +668,7 @@ def perform_deep_research(topic: str) -> str:
         link = source.get("link", "")
         snippet = source.get("snippet", "")
         title = source.get("title", "")
-
-        # We'll consider snippet enough, or we can do fetch_article_text if you want deeper text:
-        # but let's keep snippet for performance
-        # If you prefer to fetch the link, uncomment:
-        """
-        if link and not snippet.strip():
-            link_text = fetch_article_text(link)
-            snippet = link_text or snippet
-        """
+
         cleaned_text = rewrite_in_professional_style(topic, snippet)
         if cleaned_text.strip():
             final_list.append({
@@ -731,6 +773,12 @@ def generate_simple_chart(data_list: list) -> str:
     return "*(Chart could not be generated in PNG form.)*"
 
 def generate_pdf_from_markdown(md_content: str) -> bytes:
+    """
+    Convert Markdown to PDF using pdfkit (needs `markdown` library installed).
+    If you see a 'ModuleNotFoundError: No module named 'markdown'', you must install it:
+        pip install markdown
+    Also ensure wkhtmltopdf is installed on system for pdfkit usage.
+    """
     import markdown
     html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
 
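A quick way to sanity-check the new aggregation path is sketched below. This is illustrative only and not part of the commit: it assumes utils.py is importable as utils, that a real Event Registry key is substituted for the placeholder, and that the sample topic string is arbitrary. Without NEWS_API_KEY the new helper simply logs an error and returns an empty list.

    # Illustrative sketch, not part of the commit.
    import os
    from utils import fetch_eventregistry_articles, perform_deep_research

    # Placeholder key: the helper logs an error and returns [] if this is missing.
    os.environ.setdefault("NEWS_API_KEY", "<your-eventregistry-key>")

    # Exercise the new Event Registry helper on its own.
    articles = fetch_eventregistry_articles("solar energy in India", count=5)
    for art in articles:
        print(art["title"], "->", art["link"])

    # perform_deep_research() also tries Google (GOOGLE_CSE_ID), Bing (SERP_API_KEY),
    # Wikipedia and RSS, and only falls back to research_topic() when every source
    # comes back empty.
    report = perform_deep_research("solar energy in India")
    print(report[:500])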
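The new docstring on generate_pdf_from_markdown points at pdfkit and wkhtmltopdf, but this hunk only shows the Markdown-to-HTML step. Below is a minimal sketch of the remaining HTML-to-PDF step, assuming pdfkit really is the renderer used later in the function and that wkhtmltopdf is on the system path; the helper name markdown_to_pdf_bytes is made up for the sketch.

    # Sketch only; mirrors the dependencies stated in the docstring.
    import markdown
    import pdfkit

    def markdown_to_pdf_bytes(md_content: str) -> bytes:
        html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
        # Passing False as the output path makes pdfkit return the rendered PDF as bytes.
        return pdfkit.from_string(html_content, False)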