Update utils.py
utils.py (CHANGED)
@@ -1,50 +1,43 @@
 import os
-import re
 import json
 import requests
 import tempfile
-from
-from typing import List, Literal
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
-from transformers import pipeline
-import yt_dlp
 import tiktoken
-import numpy as np
-import torch
-import random
-
-from io import BytesIO
-import pdfkit
-import markdown  # Added for Markdown to HTML conversion
-
-# Define Dialogue Models
 class DialogueItem(BaseModel):
-    speaker:
-    display_speaker: str
     text: str

 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]

     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {
@@ -68,51 +61,79 @@ def extract_text_from_url(url):
         print(f"[ERROR] Exception during text extraction from URL: {e}")
         return ""

-###############################################################################
-# Event Registry (News API) aggregator
-###############################################################################
 def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
     """
     Query https://eventregistry.org/api/v1/article/getArticles
@@ -161,9 +182,6 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
         print("[ERROR] Event Registry approach failed:", e)
         return []

-###############################################################################
-# Bing via SerpApi
-###############################################################################
 def fetch_bing_results(query: str, count: int = 10) -> list:
     serp_api_key = os.environ.get("SERP_API_KEY")
     if not serp_api_key:
@@ -193,18 +211,103 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
         print("[ERROR] Bing SerpApi approach failed:", e)
         return []

 ###############################################################################
 # Unified aggregator: google + bing + wiki + rss + event registry + fallback
 ###############################################################################
 def perform_deep_research(topic: str) -> str:
     """
     """
     # Step 1: Google
     google_cse_id = os.environ.get("GOOGLE_CSE_ID")
     google_api_key = os.environ.get("GOOGLE_API_KEY")
@@ -217,12 +320,12 @@ def perform_deep_research(topic: str) -> str:
                 "q": topic,
                 "cx": google_cse_id,
                 "key": google_api_key,
-                "num": 10
             }
             resp = requests.get(url, params=params, timeout=15)
             resp.raise_for_status()
             data = resp.json()
-            items = data.get("items", [])
             for it in items:
                 google_sources.append({
                     "title": it.get("title", ""),
@@ -233,7 +336,7 @@ def perform_deep_research(topic: str) -> str:
         print("[ERROR] Google approach failed:", e)

     # Step 2: Bing
-    bing_results = fetch_bing_results(topic, count=10)

     # Step 3: Wikipedia summary
     wiki_summary_text = fetch_wikipedia_summary(topic)
@@ -245,7 +348,8 @@ def perform_deep_research(topic: str) -> str:
             "snippet": wiki_summary_text
         }

-    # Step 4: RSS approach
     sources_dict = {
         "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
         "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -255,8 +359,7 @@ def perform_deep_research(topic: str) -> str:
         "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
         "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
     }
-
-    for name, feed_url in sources_dict.items():
         try:
             items = fetch_rss_feed(feed_url)
             if not items:
@@ -281,9 +384,9 @@ def perform_deep_research(topic: str) -> str:
             continue

     # Step 5: Event Registry
-    event_registry_res = fetch_eventregistry_articles(topic, count=10)

-    # Combine
     combined = []
     combined.extend(google_sources)
     combined.extend(bing_results)
@@ -305,42 +408,26 @@ def perform_deep_research(topic: str) -> str:
         }]
         return _draft_professional_report(topic, fallback_data)
     else:
-                "index": idx,
-                "title": title,
-                "link": link,
-                "cleaned_text": cleaned_text
-            })

-        fallback_text = query_llm_for_additional_info(topic, "")
-        cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
-        fallback_data = [{
-            "index": 1,
-            "title": "Fallback Info",
-            "link": "N/A",
-            "cleaned_text": cleaned_fb
-        }]
-        return _draft_professional_report(topic, fallback_data)

-    return _draft_professional_report(topic, final_list)

 def _draft_professional_report(topic: str, sources_list: list) -> str:
     """
     - Title
     - Executive Summary
     - Introduction
@@ -349,6 +436,13 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
     - Conclusion
     - References footnotes
     Ensures at least ~1000 words.
     """
     merged_text = []
     footnotes = []
@@ -362,32 +456,32 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
         merged_text.append(text_block)
     all_content = "\n\n".join(merged_text)

     system_prompt = f"""You are a highly skilled professional research analyst.
-Your task is to produce a comprehensive and detailed formal research report that includes the following sections:

 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
 4. **Main Body:**
-   - *(Continue as needed
-   - **Analysis:**
-5. **Conclusion:**
 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.

 **Requirements:**
-- **Length:**
 - **Content Quality:**
 - Incorporate relevant facts, figures, and statistics.
 - Use professional and clear language.
 - Ensure each section is well-developed without unnecessary repetition.
-- **Structure:**
-- **Formatting:**

 -----------------------------------------------------------------------
 {all_content}
 -----------------------------------------------------------------------
@@ -404,18 +498,16 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
     # Calculate token counts
     max_tokens = 6000  # OpenRouter's token limit
     system_prompt_tokens = count_tokens(system_prompt)
-    all_content_tokens = count_tokens(all_content)
-    total_tokens = system_prompt_tokens + all_content_tokens

-    print(f"[DEBUG] Total tokens before optimization: {

         # Calculate allowed tokens for all_content
         allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100  # Reserve 100 tokens buffer
         if allowed_tokens_for_content <= 0:
             print("[ERROR] System prompt alone exceeds the token limit.")
             return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
-
         # Truncate all_content to fit
         tokenizer = tiktoken.get_encoding("cl100k_base")
         all_content_tokens_list = tokenizer.encode(all_content)
@@ -428,7 +520,7 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
         response = call_deepseek_api(
             system_prompt=system_prompt,
             user_prompt="",  # No additional user prompt
-            max_tokens=3000,
             temperature=0.7
         )
         final_report = response.strip()
@@ -441,6 +533,10 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
         print("[ERROR] Could not finalize professional report:", e)
         return "An unexpected error occurred. Please try again later."

 def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
     """
     Function to call DeepSeek R1 via OpenRouter API.
@@ -465,8 +561,17 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
         }
         response = requests.post("https://openrouter.ai/api/v1/chat/completions",
                                  headers=headers, data=json.dumps(data))
-        response.
     except requests.exceptions.HTTPError as e:
         status_code = e.response.status_code
         error_content = e.response.json()
@@ -483,108 +588,14 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
         print("[ERROR] Could not communicate with OpenRouter API:", e)
         raise ValueError("An unexpected error occurred. Please try again later.")

-    Args:
-        markdown_text (str): The Markdown content to convert.
-        # Convert Markdown to HTML
-        html = markdown.markdown(markdown_text)
-
-        # Generate PDF from HTML
-        pdf_bytes = pdfkit.from_string(html, False)  # False to return as bytes
-
-        return pdf_bytes
-    except Exception as e:
-        print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
-        return b""
-
-def fetch_wikipedia_summary(topic: str) -> str:
-    print("[LOG] Fetching Wikipedia summary for:", topic)
-    try:
-        search_url = (
-            f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
-            "&limit=1&namespace=0&format=json"
-        )
-        resp = requests.get(search_url)
-        if resp.status_code != 200:
-            print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
-            return ""
-        data = resp.json()
-        if len(data) > 1 and data[1]:
-            title = data[1][0]
-            summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
-            s_resp = requests.get(summary_url)
-            if s_resp.status_code == 200:
-                s_data = s_resp.json()
-                if "extract" in s_data:
-                    print("[LOG] Wikipedia summary fetched successfully.")
-                    return s_data["extract"]
-        return ""
-    except Exception as e:
-        print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
-        return ""
-
-def fetch_rss_feed(feed_url: str) -> list:
-    print("[LOG] Fetching RSS feed:", feed_url)
-    try:
-        resp = requests.get(feed_url)
-        if resp.status_code != 200:
-            print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
-            return []
-        soup = BeautifulSoup(resp.content, "xml")
-        items = soup.find_all("item")
-        return items
-    except Exception as e:
-        print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
-        return []
-
-def find_relevant_article(items, topic: str, min_match=2) -> tuple:
-    print("[LOG] Finding relevant articles...")
-    keywords = re.findall(r'\w+', topic.lower())
-    for item in items:
-        title = item.find("title").get_text().strip() if item.find("title") else ""
-        description = item.find("description").get_text().strip() if item.find("description") else ""
-        text = (title + " " + description).lower()
-        matches = sum(1 for kw in keywords if kw in text)
-        if matches >= min_match:
-            link = item.find("link").get_text().strip() if item.find("link") else ""
-            print(f"[LOG] Relevant article found: {title}")
-            return title, description, link
-    return None, None, None
-
-def fetch_article_text(link: str) -> str:
-    print("[LOG] Fetching article text from:", link)
-    if not link:
-        print("[LOG] No link provided for article text.")
-        return ""
-    try:
-        resp = requests.get(link)
-        if resp.status_code != 200:
-            print(f"[ERROR] Failed to fetch article from {link}")
-            return ""
-        soup = BeautifulSoup(resp.text, 'html.parser')
-        paragraphs = soup.find_all("p")
-        text = " ".join(p.get_text() for p in paragraphs[:10])  # Fetch more paragraphs for depth
-        print("[LOG] Article text fetched successfully.")
-        return text.strip()
-    except Exception as e:
-        print(f"[ERROR] Error fetching article text: {e}")
-        return ""
-
-###############################################################################
-# Comprehensive Audio Generation Function
-###############################################################################
-def generate_audio_mp3(text: str, speaker: str) -> str:
-    """
-    This function is correctly generating and returning the actual MP3 file path.
-    It utilizes Deepgram for English (American) and Murf for other languages.
     """
     try:
         import streamlit as st
@@ -709,54 +720,367 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
     print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
     return final_mp3_path

     }

     try:
-        response =
-            raise ValueError("transcriptionAsText field is missing or empty.")
     except Exception as e:
-        print("[ERROR]

 ###############################################################################
 ###############################################################################

 def generate_script(
@@ -826,7 +1150,7 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
         List[DialogueItem]: A list of DialogueItem objects.
     """
     # Define a regex pattern to identify lines like "HostName: Dialogue"
-    pattern =
     matches = re.findall(pattern, script_text)

     dialogue_items = []
@@ -844,53 +1168,6 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
 # Additional Helper Functions (if any)
 ###############################################################################

-def _preprocess_text_for_tts(text: str, speaker: str) -> str:
-    # Unchanged logic for adding filler words, etc.
-    text = re.sub(r"\bNo\.\b", "Number", text)
-    text = re.sub(r"\b(?i)SaaS\b", "sass", text)
-    abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
-
-    def insert_periods_for_abbrev(m):
-        abbr = m.group(0)
-        if abbr in abbreviations_as_words:
-            return abbr
-        return ".".join(list(abbr)) + "."
-
-    text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
-    text = re.sub(r"\.\.", ".", text)
-
-    def remove_periods_for_tts(m):
-        return m.group().replace(".", " ").strip()
-
-    text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
-    text = re.sub(r"-", " ", text)
-    text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
-    text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
-    text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
-
-    if speaker != "Jane":
-        def insert_thinking_pause(m):
-            word = m.group(1)
-            if random.random() < 0.3:
-                filler = random.choice(['hmm,', 'well,', 'let me see,'])
-                return f"{word}..., {filler}"
-            else:
-                return f"{word}...,"
-
-        keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
-        text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
-
-        conj_pattern = r"\b(and|but|so|because|however)\b"
-        text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
-
-    text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
-
-    def capitalize_match(m):
-        return m.group().upper()
-
-    text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
-    return text.strip()
-
 def _spell_digits(d: str) -> str:
     digit_map = {
@@ -899,35 +1176,6 @@ def _spell_digits(d: str) -> str:
     }
     return " ".join(digit_map[ch] for ch in d if ch in digit_map)

-def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
-    # unchanged
-    if custom_music_path:
-        music_path = custom_music_path
-    else:
-        music_path = "bg_music.mp3"
-
-    if not os.path.exists(music_path):
-        print(f"[ERROR] Background music file not found: {music_path}")
-        return spoken  # Return spoken audio without background music
-
-    try:
-        bg_music = AudioSegment.from_file(music_path, format="mp3")
-    except Exception as e:
-        print("[ERROR] Failed to load background music:", e)
-        return spoken
-
-    bg_music = bg_music - 18.0
-    total_length_ms = len(spoken) + 2000
-    looped_music = AudioSegment.empty()
-    while len(looped_music) < total_length_ms:
-        looped_music += bg_music
-    looped_music = looped_music[:total_length_ms]
-    final_mix = looped_music.overlay(spoken, position=2000)
-    return final_mix
-
 ###############################################################################
 ###############################################################################
-# The perform_deep_research function is already defined above.
-
-# No need to redefine perform_deep_research again.
 import os
 import json
+import re
 import requests
 import tempfile
+from typing import List
 from pydantic import BaseModel
+from bs4 import BeautifulSoup
 from pydub import AudioSegment, effects
 import tiktoken

+# Define Pydantic Models
 class DialogueItem(BaseModel):
+    speaker: str
+    display_speaker: str
     text: str

 class Dialogue(BaseModel):
     dialogue: List[DialogueItem]

+###############################################################################
+# Helper Functions
+###############################################################################
+
+def extract_text_from_pdf(pdf_path: str) -> str:
+    print("[LOG] Extracting text from PDF:", pdf_path)
+    try:
+        reader = pypdf.PdfReader(pdf_path)
+        text = ""
+        for page_num, page in enumerate(reader.pages):
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
+        print("[LOG] Text extraction from PDF successful.")
+        return text
+    except Exception as e:
+        print(f"[ERROR] Failed to extract text from PDF: {e}")
+        return ""
+
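The new DialogueItem/Dialogue models above are what the parsing and TTS code further down relies on. A minimal validation sketch (illustrative only, not part of the diff; note that extract_text_from_pdf calls pypdf.PdfReader, so an `import pypdf` is assumed to exist elsewhere in the module):

```python
# Illustrative sketch: validating a dialogue payload with the models defined above.
from typing import List
from pydantic import BaseModel

class DialogueItem(BaseModel):
    speaker: str
    display_speaker: str
    text: str

class Dialogue(BaseModel):
    dialogue: List[DialogueItem]

payload = {"dialogue": [
    {"speaker": "Host (Jane)", "display_speaker": "Jane", "text": "Welcome to the show."},
    {"speaker": "Guest", "display_speaker": "John", "text": "Glad to be here."},
]}
dialogue = Dialogue(**payload)                 # raises ValidationError on malformed input
print(dialogue.dialogue[0].display_speaker)    # "Jane"
```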
+def extract_text_from_url(url: str) -> str:
     print("[LOG] Extracting text from URL:", url)
     try:
         headers = {

         print(f"[ERROR] Exception during text extraction from URL: {e}")
         return ""

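A usage sketch for the URL extractor above (the URL is a placeholder; the function returns an empty string on failure):

```python
# Hypothetical call; extract_text_from_url is defined in the diff above.
article_text = extract_text_from_url("https://example.com/some-article")
if not article_text:
    print("[LOG] No text extracted, continuing with other sources.")
```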
64 |
+
def fetch_wikipedia_summary(topic: str) -> str:
|
65 |
+
print("[LOG] Fetching Wikipedia summary for:", topic)
|
66 |
+
try:
|
67 |
+
search_url = (
|
68 |
+
f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
|
69 |
+
"&limit=1&namespace=0&format=json"
|
70 |
+
)
|
71 |
+
resp = requests.get(search_url)
|
72 |
+
if resp.status_code != 200:
|
73 |
+
print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
|
74 |
+
return ""
|
75 |
+
data = resp.json()
|
76 |
+
if len(data) > 1 and data[1]:
|
77 |
+
title = data[1][0]
|
78 |
+
summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
|
79 |
+
s_resp = requests.get(summary_url)
|
80 |
+
if s_resp.status_code == 200:
|
81 |
+
s_data = s_resp.json()
|
82 |
+
if "extract" in s_data:
|
83 |
+
print("[LOG] Wikipedia summary fetched successfully.")
|
84 |
+
return s_data["extract"]
|
85 |
+
return ""
|
86 |
+
except Exception as e:
|
87 |
+
print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
|
88 |
return ""
|
89 |
|
90 |
+
def fetch_rss_feed(feed_url: str) -> list:
|
91 |
+
print("[LOG] Fetching RSS feed:", feed_url)
|
92 |
+
try:
|
93 |
+
resp = requests.get(feed_url)
|
94 |
+
if resp.status_code != 200:
|
95 |
+
print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
|
96 |
+
return []
|
97 |
+
soup = BeautifulSoup(resp.content, "xml")
|
98 |
+
items = soup.find_all("item")
|
99 |
+
return items
|
100 |
+
except Exception as e:
|
101 |
+
print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
|
102 |
+
return []
|
103 |
|
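A quick sketch of the Wikipedia helper above in use (the topic is arbitrary and network access is assumed):

```python
summary = fetch_wikipedia_summary("Kerala")
if summary:
    print(summary[:200])   # first 200 characters of the extract
```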
104 |
+
def find_relevant_article(items, topic: str, min_match=2) -> tuple:
|
105 |
+
print("[LOG] Finding relevant articles...")
|
106 |
+
keywords = re.findall(r'\w+', topic.lower())
|
107 |
+
for item in items:
|
108 |
+
title = item.find("title").get_text().strip() if item.find("title") else ""
|
109 |
+
description = item.find("description").get_text().strip() if item.find("description") else ""
|
110 |
+
text = (title + " " + description).lower()
|
111 |
+
matches = sum(1 for kw in keywords if kw in text)
|
112 |
+
if matches >= min_match:
|
113 |
+
link = item.find("link").get_text().strip() if item.find("link") else ""
|
114 |
+
print(f"[LOG] Relevant article found: {title}")
|
115 |
+
return title, description, link
|
116 |
+
return None, None, None
|
117 |
|
118 |
+
def fetch_article_text(link: str) -> str:
|
119 |
+
print("[LOG] Fetching article text from:", link)
|
120 |
+
if not link:
|
121 |
+
print("[LOG] No link provided for article text.")
|
122 |
+
return ""
|
123 |
try:
|
124 |
+
resp = requests.get(link)
|
125 |
+
if resp.status_code != 200:
|
126 |
+
print(f"[ERROR] Failed to fetch article from {link}")
|
127 |
+
return ""
|
128 |
+
soup = BeautifulSoup(resp.text, 'html.parser')
|
129 |
+
paragraphs = soup.find_all("p")
|
130 |
+
text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
|
131 |
+
print("[LOG] Article text fetched successfully.")
|
132 |
+
return text.strip()
|
133 |
except Exception as e:
|
134 |
+
print(f"[ERROR] Error fetching article text: {e}")
|
135 |
+
return ""
|
136 |
|
|
|
|
|
|
|
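The three RSS helpers are meant to be chained; a minimal sketch using one of the feed URLs from the sources_dict defined later in perform_deep_research:

```python
items = fetch_rss_feed("https://feeds.bbci.co.uk/news/rss.xml")
title, desc, link = find_relevant_article(items, "climate change", min_match=2)
if link:
    body = fetch_article_text(link)
    print(f"{title} ({link}): {len(body)} characters fetched")
```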
 def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
     """
     Query https://eventregistry.org/api/v1/article/getArticles

         print("[ERROR] Event Registry approach failed:", e)
         return []

 def fetch_bing_results(query: str, count: int = 10) -> list:
     serp_api_key = os.environ.get("SERP_API_KEY")
     if not serp_api_key:

         print("[ERROR] Bing SerpApi approach failed:", e)
         return []

+###############################################################################
+# Summarization Function
+###############################################################################
+
+def summarize_text(text: str, max_length: int = 200) -> str:
+    """
+    Summarizes the given text to the specified maximum word length.
+
+    Args:
+        text (str): The text to summarize.
+        max_length (int): The maximum number of words in the summary.
+
+    Returns:
+        str: The summarized text.
+    """
+    system_prompt = (
+        f"You are a professional summarizer. Please condense the following text "
+        f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
+    )
+    user_prompt = text
+
+    try:
+        summary = call_deepseek_api(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            max_tokens=500,  # Adjust as needed
+            temperature=0.5
+        )
+        return summary.strip()
+    except Exception as e:
+        print(f"[ERROR] Summarization failed: {e}")
+        # Fallback: return the original text truncated to max_length words
+        return " ".join(text.split()[:max_length]) + "..."
+
+###############################################################################
+# Rewrites text in professional style
+###############################################################################
+
+def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
+    if not raw_text.strip():
+        return ""
+
+    system_prompt = (
+        "You are a professional writing assistant. Your goal is to rewrite "
+        "the provided text so that it is:\n"
+        "1) Written in clear, fluent, professional English\n"
+        f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
+        "3) Organized in paragraphs or bullet points\n"
+        "4) Maintained or slightly enhanced in detail without significant summarization\n"
+        "5) No references to the rewriting process or disclaimers\n"
+    )
+
+    user_prompt = f"Please rewrite this text:\n\n{raw_text}"
+
+    try:
+        rewritten = call_deepseek_api(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            max_tokens=1024,
+            temperature=0.7
+        )
+        # Optionally, summarize the rewritten text to further reduce token count
+        summary = summarize_text(rewritten, max_length=150)
+        return summary
+    except Exception as e:
+        print("[ERROR] rewriting text via Deepseek LLM failed:", e)
+        return raw_text
+
+###############################################################################
+# Event Registry (News API) aggregator
+###############################################################################
+# Already handled in fetch_eventregistry_articles
+
+###############################################################################
+# Bing via SerpApi
+###############################################################################
+# Already handled in fetch_bing_results
+
|
292 |
###############################################################################
|
293 |
# Unified aggregator: google + bing + wiki + rss + event registry + fallback
|
294 |
###############################################################################
|
295 |
+
|
296 |
def perform_deep_research(topic: str) -> str:
|
297 |
"""
|
298 |
+
Perform deep research by aggregating data from multiple sources.
|
299 |
+
Limits the number of sources to prevent exceeding token limits.
|
300 |
+
Summarizes each source's content to reduce token count.
|
301 |
+
|
302 |
+
Args:
|
303 |
+
topic (str): The research topic.
|
304 |
+
|
305 |
+
Returns:
|
306 |
+
str: The final professional report in Markdown format.
|
307 |
"""
|
308 |
+
# Define the maximum number of sources per aggregator
|
309 |
+
MAX_SOURCES_PER_AGGREGATOR = 5
|
310 |
+
|
311 |
# Step 1: Google
|
312 |
google_cse_id = os.environ.get("GOOGLE_CSE_ID")
|
313 |
google_api_key = os.environ.get("GOOGLE_API_KEY")
|
|
|
320 |
"q": topic,
|
321 |
"cx": google_cse_id,
|
322 |
"key": google_api_key,
|
323 |
+
"num": 10 # Fetch more to account for filtering
|
324 |
}
|
325 |
resp = requests.get(url, params=params, timeout=15)
|
326 |
resp.raise_for_status()
|
327 |
data = resp.json()
|
328 |
+
items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
|
329 |
for it in items:
|
330 |
google_sources.append({
|
331 |
"title": it.get("title", ""),
|
|
|
336 |
print("[ERROR] Google approach failed:", e)
|
337 |
|
338 |
# Step 2: Bing
|
339 |
+
bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
|
340 |
|
341 |
# Step 3: Wikipedia summary
|
342 |
wiki_summary_text = fetch_wikipedia_summary(topic)
|
|
|
348 |
"snippet": wiki_summary_text
|
349 |
}
|
350 |
|
351 |
+
# Step 4: RSS approach (NewsAPI assumed here)
|
352 |
+
rss_sources = []
|
353 |
sources_dict = {
|
354 |
"BBC": "https://feeds.bbci.co.uk/news/rss.xml",
|
355 |
"CNN": "http://rss.cnn.com/rss/edition.rss",
|
|
|
359 |
"The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
|
360 |
"Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
|
361 |
}
|
362 |
+
for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
|
|
|
363 |
try:
|
364 |
items = fetch_rss_feed(feed_url)
|
365 |
if not items:
|
|
|
384 |
continue
|
385 |
|
386 |
# Step 5: Event Registry
|
387 |
+
event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
|
388 |
|
389 |
+
# Combine all sources
|
390 |
combined = []
|
391 |
combined.extend(google_sources)
|
392 |
combined.extend(bing_results)
|
|
|
408 |
}]
|
409 |
return _draft_professional_report(topic, fallback_data)
|
410 |
else:
|
411 |
+
# Summarize each source's snippet to reduce token count
|
412 |
+
summarized_list = []
|
413 |
+
for idx, source in enumerate(combined, start=1):
|
414 |
+
summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
|
415 |
+
summarized_list.append({
|
416 |
+
"index": idx,
|
417 |
+
"title": source["title"],
|
418 |
+
"link": source["link"],
|
419 |
+
"cleaned_text": summary
|
420 |
+
})
|
421 |
+
|
422 |
+
return _draft_professional_report(topic, summarized_list)
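perform_deep_research caps each aggregator at MAX_SOURCES_PER_AGGREGATOR and summarizes every snippet before drafting; end-to-end use looks roughly like this (the topic is arbitrary, and the Google/SerpApi/Event Registry/OpenRouter keys are assumed to be set):

```python
report_markdown = perform_deep_research("electric vehicle adoption in India")
pdf_bytes = generate_pdf_from_markdown(report_markdown)   # defined later in this file
with open("report.pdf", "wb") as fh:
    fh.write(pdf_bytes)
```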
423 |
|
424 |
+
###############################################################################
|
425 |
+
# Professional Report Drafting Function
|
426 |
+
###############################################################################
|
427 |
|
428 |
def _draft_professional_report(topic: str, sources_list: list) -> str:
|
429 |
"""
|
430 |
+
Build a concise professional report:
|
431 |
- Title
|
432 |
- Executive Summary
|
433 |
- Introduction
|
|
|
436 |
- Conclusion
|
437 |
- References footnotes
|
438 |
Ensures at least ~1000 words.
|
439 |
+
|
440 |
+
Args:
|
441 |
+
topic (str): The research topic.
|
442 |
+
sources_list (list): List of summarized sources.
|
443 |
+
|
444 |
+
Returns:
|
445 |
+
str: The final professional report in Markdown format.
|
446 |
"""
|
447 |
merged_text = []
|
448 |
footnotes = []
|
|
|
456 |
merged_text.append(text_block)
|
457 |
all_content = "\n\n".join(merged_text)
|
458 |
|
459 |
+
# Build the system prompt
|
460 |
system_prompt = f"""You are a highly skilled professional research analyst.
|
461 |
+
You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
|
|
|
462 |
|
463 |
+
**Report Structure:**
|
464 |
+
1. **Title:** {topic}
|
465 |
+
2. **Executive Summary:** A concise overview of key findings and insights.
|
466 |
3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
|
467 |
4. **Main Body:**
|
468 |
+
- **Section 1:** Insights from Source 1.
|
469 |
+
- **Section 2:** Insights from Source 2.
|
470 |
+
- *(Continue as needed)*
|
471 |
+
- **Analysis:** An in-depth analysis combining information from all sources.
|
472 |
+
5. **Conclusion:** Final thoughts, implications, and potential future directions.
|
473 |
6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
|
474 |
|
475 |
**Requirements:**
|
476 |
+
- **Length:** At least 1,000 words.
|
477 |
- **Content Quality:**
|
478 |
- Incorporate relevant facts, figures, and statistics.
|
479 |
- Use professional and clear language.
|
480 |
- Ensure each section is well-developed without unnecessary repetition.
|
481 |
+
- **Structure:** Logical and cohesive flow throughout the report.
|
482 |
+
- **Formatting:** Proper formatting for headings, sub-headings, and references.
|
483 |
|
484 |
+
**Aggregated Content from Sources:**
|
485 |
-----------------------------------------------------------------------
|
486 |
{all_content}
|
487 |
-----------------------------------------------------------------------
|
|
|
498 |
# Calculate token counts
|
499 |
max_tokens = 6000 # OpenRouter's token limit
|
500 |
system_prompt_tokens = count_tokens(system_prompt)
|
|
|
|
|
501 |
|
502 |
+
print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
|
503 |
|
504 |
+
if system_prompt_tokens > max_tokens:
|
505 |
# Calculate allowed tokens for all_content
|
506 |
allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
|
507 |
if allowed_tokens_for_content <= 0:
|
508 |
print("[ERROR] System prompt alone exceeds the token limit.")
|
509 |
return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
|
510 |
+
|
511 |
# Truncate all_content to fit
|
512 |
tokenizer = tiktoken.get_encoding("cl100k_base")
|
513 |
all_content_tokens_list = tokenizer.encode(all_content)
|
|
|
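The truncation above relies on tiktoken's cl100k_base encoding; the same count-then-truncate pattern in isolation:

```python
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def truncate_to_tokens(text: str, budget: int) -> str:
    # Encode, clip to the token budget, and decode back to text.
    tokens = tokenizer.encode(text)
    if len(tokens) <= budget:
        return text
    return tokenizer.decode(tokens[:budget])

print(len(tokenizer.encode("hello world")))           # token count for a short string
print(truncate_to_tokens("hello world " * 1000, 50))  # clipped to roughly 50 tokens
```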
520 |
response = call_deepseek_api(
|
521 |
system_prompt=system_prompt,
|
522 |
user_prompt="", # No additional user prompt
|
523 |
+
max_tokens=3000, # Adjusted to allow more detailed output
|
524 |
temperature=0.7
|
525 |
)
|
526 |
final_report = response.strip()
|
|
|
533 |
print("[ERROR] Could not finalize professional report:", e)
|
534 |
return "An unexpected error occurred. Please try again later."
|
535 |
|
536 |
+
###############################################################################
|
537 |
+
# OpenRouter API Communication Function
|
538 |
+
###############################################################################
|
539 |
+
|
540 |
def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
|
541 |
"""
|
542 |
Function to call DeepSeek R1 via OpenRouter API.
|
|
|
561 |
}
|
562 |
response = requests.post("https://openrouter.ai/api/v1/chat/completions",
|
563 |
headers=headers, data=json.dumps(data))
|
564 |
+
if response.status_code != 200:
|
565 |
+
error_message = response.json().get("error", {}).get("message", "Unknown error")
|
566 |
+
print(f"[ERROR] OpenRouter API error: {response.status_code} - {error_message}")
|
567 |
+
raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
|
568 |
+
|
569 |
+
response_json = response.json()
|
570 |
+
if "choices" not in response_json or not response_json["choices"]:
|
571 |
+
print("[ERROR] 'choices' key missing in OpenRouter API response.")
|
572 |
+
raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
|
573 |
+
|
574 |
+
return response_json["choices"][0]["message"]["content"]
|
575 |
except requests.exceptions.HTTPError as e:
|
576 |
status_code = e.response.status_code
|
577 |
error_content = e.response.json()
|
|
|
588 |
print("[ERROR] Could not communicate with OpenRouter API:", e)
|
589 |
raise ValueError("An unexpected error occurred. Please try again later.")
|
590 |
|
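Because call_deepseek_api now raises ValueError both on HTTP errors and on responses missing the choices field, callers can wrap it like this (the prompts are placeholders):

```python
try:
    answer = call_deepseek_api(
        system_prompt="You are a concise assistant.",
        user_prompt="List three uses of RSS feeds.",
        max_tokens=300,
        temperature=0.5,
    )
    print(answer)
except ValueError as err:
    print(f"[ERROR] LLM call failed: {err}")
```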
591 |
+
###############################################################################
|
592 |
+
# Comprehensive Audio Generation Function
|
593 |
+
###############################################################################
|
|
|
|
|
|
|
594 |
|
595 |
+
def generate_audio_mp3(text: str, speaker: str) -> str:
|
596 |
+
"""
|
597 |
+
Generates and returns the actual MP3 file path.
|
598 |
+
Utilizes Deepgram for English (American) and Murf for other languages.
|
599 |
"""
|
600 |
try:
|
601 |
import streamlit as st
|
|
|
720 |
print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
|
721 |
return final_mp3_path
|
722 |
|
723 |
+
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
724 |
+
# Unchanged logic for adding filler words, etc.
|
725 |
+
text = re.sub(r"\bNo\.\b", "Number", text)
|
726 |
+
text = re.sub(r"\b(?i)SaaS\b", "sass", text)
|
727 |
+
abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
|
728 |
+
|
729 |
+
def insert_periods_for_abbrev(m):
|
730 |
+
abbr = m.group(0)
|
731 |
+
if abbr in abbreviations_as_words:
|
732 |
+
return abbr
|
733 |
+
return ".".join(list(abbr)) + "."
|
734 |
+
|
735 |
+
text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
|
736 |
+
text = re.sub(r"\.\.", ".", text)
|
737 |
+
|
738 |
+
def remove_periods_for_tts(m):
|
739 |
+
return m.group().replace(".", " ").strip()
|
740 |
+
|
741 |
+
text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
|
742 |
+
text = re.sub(r"-", " ", text)
|
743 |
+
text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
|
744 |
+
text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
|
745 |
+
text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
|
746 |
+
|
747 |
+
if speaker != "Jane":
|
748 |
+
def insert_thinking_pause(m):
|
749 |
+
word = m.group(1)
|
750 |
+
if random.random() < 0.3:
|
751 |
+
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
752 |
+
return f"{word}..., {filler}"
|
753 |
+
else:
|
754 |
+
return f"{word}...,"
|
755 |
+
|
756 |
+
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
757 |
+
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
758 |
+
|
759 |
+
conj_pattern = r"\b(and|but|so|because|however)\b"
|
760 |
+
text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
761 |
+
|
762 |
+
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
763 |
+
|
764 |
+
def capitalize_match(m):
|
765 |
+
return m.group().upper()
|
766 |
+
|
767 |
+
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
|
768 |
+
return text.strip()
|
769 |
+
|
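A quick check of the TTS preprocessing above (fillers and pauses are inserted at random for non-Jane speakers, so the output varies between runs):

```python
sample = "However, the point about NASA was significant. No. 5 came next, haha."
print(_preprocess_text_for_tts(sample, speaker="John"))
```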
770 |
+
###############################################################################
|
771 |
+
# Unified aggregator: google + bing + wiki + rss + event registry + fallback
|
772 |
+
###############################################################################
|
773 |
+
|
774 |
+
def perform_deep_research(topic: str) -> str:
|
775 |
+
"""
|
776 |
+
Perform deep research by aggregating data from multiple sources.
|
777 |
+
Limits the number of sources to prevent exceeding token limits.
|
778 |
+
Summarizes each source's content to reduce token count.
|
779 |
+
|
780 |
+
Args:
|
781 |
+
topic (str): The research topic.
|
782 |
+
|
783 |
+
Returns:
|
784 |
+
str: The final professional report in Markdown format.
|
785 |
+
"""
|
786 |
+
# Define the maximum number of sources per aggregator
|
787 |
+
MAX_SOURCES_PER_AGGREGATOR = 5
|
788 |
+
|
789 |
+
# Step 1: Google
|
790 |
+
google_cse_id = os.environ.get("GOOGLE_CSE_ID")
|
791 |
+
google_api_key = os.environ.get("GOOGLE_API_KEY")
|
792 |
+
google_sources = []
|
793 |
+
if google_cse_id and google_api_key:
|
794 |
+
try:
|
795 |
+
print("[LOG] Attempting Google CSE for topic:", topic)
|
796 |
+
url = "https://customsearch.googleapis.com/customsearch/v1"
|
797 |
+
params = {
|
798 |
+
"q": topic,
|
799 |
+
"cx": google_cse_id,
|
800 |
+
"key": google_api_key,
|
801 |
+
"num": 10 # Fetch more to account for filtering
|
802 |
+
}
|
803 |
+
resp = requests.get(url, params=params, timeout=15)
|
804 |
+
resp.raise_for_status()
|
805 |
+
data = resp.json()
|
806 |
+
items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
|
807 |
+
for it in items:
|
808 |
+
google_sources.append({
|
809 |
+
"title": it.get("title", ""),
|
810 |
+
"link": it.get("link", ""),
|
811 |
+
"snippet": it.get("snippet", "")
|
812 |
+
})
|
813 |
+
except Exception as e:
|
814 |
+
print("[ERROR] Google approach failed:", e)
|
815 |
+
|
816 |
+
# Step 2: Bing
|
817 |
+
bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
|
818 |
+
|
819 |
+
# Step 3: Wikipedia summary
|
820 |
+
wiki_summary_text = fetch_wikipedia_summary(topic)
|
821 |
+
wiki_item = None
|
822 |
+
if wiki_summary_text:
|
823 |
+
wiki_item = {
|
824 |
+
"title": "Wikipedia Summary",
|
825 |
+
"link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
|
826 |
+
"snippet": wiki_summary_text
|
827 |
+
}
|
828 |
+
|
829 |
+
# Step 4: RSS approach (NewsAPI assumed here)
|
830 |
+
rss_sources = []
|
831 |
+
sources_dict = {
|
832 |
+
"BBC": "https://feeds.bbci.co.uk/news/rss.xml",
|
833 |
+
"CNN": "http://rss.cnn.com/rss/edition.rss",
|
834 |
+
"Associated Press": "https://apnews.com/apf-topnews",
|
835 |
+
"NDTV": "https://www.ndtv.com/rss/top-stories",
|
836 |
+
"Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
|
837 |
+
"The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
|
838 |
+
"Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
|
839 |
}
|
840 |
+
for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
|
841 |
+
try:
|
842 |
+
items = fetch_rss_feed(feed_url)
|
843 |
+
if not items:
|
844 |
+
continue
|
845 |
+
title, desc, link = find_relevant_article(items, topic, min_match=2)
|
846 |
+
if link:
|
847 |
+
article_text = fetch_article_text(link)
|
848 |
+
if article_text:
|
849 |
+
rss_sources.append({
|
850 |
+
"title": f"{name} RSS Article",
|
851 |
+
"link": link,
|
852 |
+
"snippet": article_text
|
853 |
+
})
|
854 |
+
else:
|
855 |
+
rss_sources.append({
|
856 |
+
"title": f"{name} RSS Article",
|
857 |
+
"link": link,
|
858 |
+
"snippet": f"{title} - {desc}"
|
859 |
+
})
|
860 |
+
except Exception as e:
|
861 |
+
print(f"[ERROR] Error fetching from {name} RSS feed:", e)
|
862 |
+
continue
|
863 |
+
|
864 |
+
# Step 5: Event Registry
|
865 |
+
event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
|
866 |
+
|
867 |
+
# Combine all sources
|
868 |
+
combined = []
|
869 |
+
combined.extend(google_sources)
|
870 |
+
combined.extend(bing_results)
|
871 |
+
if wiki_item:
|
872 |
+
combined.append(wiki_item)
|
873 |
+
combined.extend(rss_sources)
|
874 |
+
combined.extend(event_registry_res)
|
875 |
+
|
876 |
+
if not combined:
|
877 |
+
print("[LOG] No results found from aggregator. Using LLM fallback.")
|
878 |
+
# LLM-based fallback
|
879 |
+
fallback_text = query_llm_for_additional_info(topic, "")
|
880 |
+
cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
|
881 |
+
fallback_data = [{
|
882 |
+
"index": 1,
|
883 |
+
"title": "Fallback Info",
|
884 |
+
"link": "N/A",
|
885 |
+
"cleaned_text": cleaned_fb
|
886 |
+
}]
|
887 |
+
return _draft_professional_report(topic, fallback_data)
|
888 |
+
else:
|
889 |
+
# Summarize each source's snippet to reduce token count
|
890 |
+
summarized_list = []
|
891 |
+
for idx, source in enumerate(combined, start=1):
|
892 |
+
summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
|
893 |
+
summarized_list.append({
|
894 |
+
"index": idx,
|
895 |
+
"title": source["title"],
|
896 |
+
"link": source["link"],
|
897 |
+
"cleaned_text": summary
|
898 |
+
})
|
899 |
+
|
900 |
+
return _draft_professional_report(topic, summarized_list)
|
901 |
+
|
902 |
+
###############################################################################
|
903 |
+
# Professional Report Drafting Function
|
904 |
+
###############################################################################
|
905 |
+
|
906 |
+
def _draft_professional_report(topic: str, sources_list: list) -> str:
|
907 |
+
"""
|
908 |
+
Build a concise professional report:
|
909 |
+
- Title
|
910 |
+
- Executive Summary
|
911 |
+
- Introduction
|
912 |
+
- Main Body with sub-headings
|
913 |
+
- Analysis
|
914 |
+
- Conclusion
|
915 |
+
- References footnotes
|
916 |
+
Ensures at least ~1000 words.
|
917 |
+
|
918 |
+
Args:
|
919 |
+
topic (str): The research topic.
|
920 |
+
sources_list (list): List of summarized sources.
|
921 |
+
|
922 |
+
Returns:
|
923 |
+
str: The final professional report in Markdown format.
|
924 |
+
"""
|
925 |
+
merged_text = []
|
926 |
+
footnotes = []
|
927 |
+
for s in sources_list:
|
928 |
+
footnotes.append(f"[^{s['index']}]: {s['link']}")
|
929 |
+
text_block = (
|
930 |
+
f"Source {s['index']} Title: {s['title']}\n"
|
931 |
+
f"FootnoteRef: [^{s['index']}]\n"
|
932 |
+
f"Text:\n{s['cleaned_text']}\n"
|
933 |
+
)
|
934 |
+
merged_text.append(text_block)
|
935 |
+
all_content = "\n\n".join(merged_text)
|
936 |
+
|
937 |
+
# Build the system prompt
|
938 |
+
system_prompt = f"""You are a highly skilled professional research analyst.
|
939 |
+
You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
|
940 |
+
|
941 |
+
**Report Structure:**
|
942 |
+
1. **Title:** {topic}
|
943 |
+
2. **Executive Summary:** A concise overview of key findings and insights.
|
944 |
+
3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
|
945 |
+
4. **Main Body:**
|
946 |
+
- **Section 1:** Insights from Source 1.
|
947 |
+
- **Section 2:** Insights from Source 2.
|
948 |
+
- *(Continue as needed)*
|
949 |
+
- **Analysis:** An in-depth analysis combining information from all sources.
|
950 |
+
5. **Conclusion:** Final thoughts, implications, and potential future directions.
|
951 |
+
6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
|
952 |
+
|
953 |
+
**Requirements:**
|
954 |
+
- **Length:** At least 1,000 words.
|
955 |
+
- **Content Quality:**
|
956 |
+
- Incorporate relevant facts, figures, and statistics.
|
957 |
+
- Use professional and clear language.
|
958 |
+
- Ensure each section is well-developed without unnecessary repetition.
|
959 |
+
- **Structure:** Logical and cohesive flow throughout the report.
|
960 |
+
- **Formatting:** Proper formatting for headings, sub-headings, and references.
|
961 |
+
|
962 |
+
**Aggregated Content from Sources:**
|
963 |
+
-----------------------------------------------------------------------
|
964 |
+
{all_content}
|
965 |
+
-----------------------------------------------------------------------
|
966 |
+
**Footnotes:**
|
967 |
+
{chr(10).join(footnotes)}
|
968 |
+
"""
|
969 |
+
|
970 |
+
# Token Counting Function
|
971 |
+
def count_tokens(text: str) -> int:
|
972 |
+
tokenizer = tiktoken.get_encoding("cl100k_base")
|
973 |
+
tokens = tokenizer.encode(text)
|
974 |
+
return len(tokens)
|
975 |
+
|
976 |
+
# Calculate token counts
|
977 |
+
max_tokens = 6000 # OpenRouter's token limit
|
978 |
+
system_prompt_tokens = count_tokens(system_prompt)
|
979 |
+
|
980 |
+
print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
|
981 |
+
|
982 |
+
if system_prompt_tokens > max_tokens:
|
983 |
+
# Calculate allowed tokens for all_content
|
984 |
+
allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
|
985 |
+
if allowed_tokens_for_content <= 0:
|
986 |
+
print("[ERROR] System prompt alone exceeds the token limit.")
|
987 |
+
return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
|
988 |
+
|
989 |
+
# Truncate all_content to fit
|
990 |
+
tokenizer = tiktoken.get_encoding("cl100k_base")
|
991 |
+
all_content_tokens_list = tokenizer.encode(all_content)
|
992 |
+
truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
|
993 |
+
truncated_content = tokenizer.decode(truncated_tokens)
|
994 |
+
system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
|
995 |
+
print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
|
996 |
|
997 |
try:
|
998 |
+
response = call_deepseek_api(
|
999 |
+
system_prompt=system_prompt,
|
1000 |
+
user_prompt="", # No additional user prompt
|
1001 |
+
max_tokens=3000, # Adjusted to allow more detailed output
|
1002 |
+
temperature=0.7
|
1003 |
+
)
|
1004 |
+
final_report = response.strip()
|
1005 |
+
# Optionally, check word count
|
1006 |
+
word_count = len(final_report.split())
|
1007 |
+
if word_count < 1000:
|
1008 |
+
print(f"[WARNING] Generated report is below desired length: {word_count} words.")
|
1009 |
+
return final_report
|
1010 |
+
except Exception as e:
|
1011 |
+
print("[ERROR] Could not finalize professional report:", e)
|
1012 |
+
return "An unexpected error occurred. Please try again later."
|
1013 |
|
1014 |
+
###############################################################################
|
1015 |
+
# PDF Generation Function
|
1016 |
+
###############################################################################
|
1017 |
+
|
1018 |
+
def generate_pdf_from_markdown(markdown_text: str) -> bytes:
|
1019 |
+
"""
|
1020 |
+
Converts Markdown text to a PDF file.
|
1021 |
+
|
1022 |
+
Args:
|
1023 |
+
markdown_text (str): The Markdown content to convert.
|
1024 |
|
1025 |
+
Returns:
|
1026 |
+
bytes: The generated PDF file in bytes.
|
1027 |
+
"""
|
1028 |
+
try:
|
1029 |
+
# Convert Markdown to HTML
|
1030 |
+
import markdown
|
1031 |
+
import pdfkit
|
1032 |
+
html = markdown.markdown(markdown_text)
|
1033 |
|
1034 |
+
# Generate PDF from HTML
|
1035 |
+
pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
|
|
|
1036 |
|
1037 |
+
return pdf_bytes
|
1038 |
+
except Exception as e:
|
1039 |
+
print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
|
1040 |
+
return b""
|
1041 |
+
|
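pdfkit wraps the external wkhtmltopdf binary, so that tool must be installed and on PATH; assuming it is, the Markdown-to-PDF helper can be used like this:

```python
md_report = "# Research Report\n\n## Executive Summary\n\nKey findings go here."
pdf_bytes = generate_pdf_from_markdown(md_report)
if pdf_bytes:
    with open("report.pdf", "wb") as fh:
        fh.write(pdf_bytes)
```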
1042 |
+
###############################################################################
|
1043 |
+
# Audio Mixing Function
|
1044 |
+
###############################################################################
|
1045 |
|
1046 |
+
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
|
1047 |
+
"""
|
1048 |
+
Mixes spoken audio with background music.
|
1049 |
+
|
1050 |
+
Args:
|
1051 |
+
spoken (AudioSegment): The spoken audio segment.
|
1052 |
+
custom_music_path (str, optional): Path to custom background music. Defaults to None.
|
1053 |
+
|
1054 |
+
Returns:
|
1055 |
+
AudioSegment: The mixed audio segment.
|
1056 |
+
"""
|
1057 |
+
# unchanged
|
1058 |
+
if custom_music_path:
|
1059 |
+
music_path = custom_music_path
|
1060 |
+
else:
|
1061 |
+
music_path = "bg_music.mp3"
|
1062 |
+
|
1063 |
+
if not os.path.exists(music_path):
|
1064 |
+
print(f"[ERROR] Background music file not found: {music_path}")
|
1065 |
+
return spoken # Return spoken audio without background music
|
1066 |
+
|
1067 |
+
try:
|
1068 |
+
bg_music = AudioSegment.from_file(music_path, format="mp3")
|
1069 |
except Exception as e:
|
1070 |
+
print("[ERROR] Failed to load background music:", e)
|
1071 |
+
return spoken
|
1072 |
+
|
1073 |
+
bg_music = bg_music - 18.0
|
1074 |
+
total_length_ms = len(spoken) + 2000
|
1075 |
+
looped_music = AudioSegment.empty()
|
1076 |
+
while len(looped_music) < total_length_ms:
|
1077 |
+
looped_music += bg_music
|
1078 |
+
looped_music = looped_music[:total_length_ms]
|
1079 |
+
final_mix = looped_music.overlay(spoken, position=2000)
|
1080 |
+
return final_mix
|
1081 |
|
1082 |
###############################################################################
|
1083 |
+
# Generate Script Function and Helper
|
1084 |
###############################################################################
|
1085 |
|
1086 |
def generate_script(
|
|
|
1150 |
List[DialogueItem]: A list of DialogueItem objects.
|
1151 |
"""
|
1152 |
# Define a regex pattern to identify lines like "HostName: Dialogue"
|
1153 |
+
pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
|
1154 |
matches = re.findall(pattern, script_text)
|
1155 |
|
1156 |
dialogue_items = []
|
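The new pattern matches lines of the form "Name: dialogue" for either speaker, case-insensitively; a small demonstration with made-up names and lines:

```python
import re

host_name, guest_name = "Jane", "John"
pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
script = "Jane: Welcome back!\njohn: Glad to be here.\nNarrator: (ignored)"
print(re.findall(pattern, script))
# [('Jane', 'Welcome back!'), ('john', 'Glad to be here.')]
```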
|
|
1168 |
# Additional Helper Functions (if any)
|
1169 |
###############################################################################
|
1170 |
|
1171 |
def _spell_digits(d: str) -> str:
|
1172 |
digit_map = {
|
1173 |
'0': 'zero', '1': 'one', '2': 'two', '3': 'three',
|
|
|
1176 |
}
|
1177 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
1178 |
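_spell_digits simply maps each character through digit_map (assuming the elided entries cover 4 through 9):

```python
print(_spell_digits("2024"))   # "two zero two four"
```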
1179 |
###############################################################################
|
1180 |
+
# End of utils.py
|
1181 |
###############################################################################
|