Update utils.py
utils.py
CHANGED
@@ -1,59 +1,56 @@
  1   import os
  2 - import json
  3   import re
  4   import requests
  5   import tempfile
  6 - import time
  7 - import logging
  8 - from typing import List
  9 - from pydantic import BaseModel
 10   from bs4 import BeautifulSoup
 11   from pydub import AudioSegment, effects
 12   import tiktoken
 13 - import
 14 - import
 15 - import pdfkit
 16   import random
 17 - import
 18 - from
 19 -
 20 - import
 21 -
 22 - # Suppress Cryptography Deprecation Warnings
 23 - warnings.filterwarnings("ignore", category=CryptographyDeprecationWarning)
 24 -
 25 - # Configure Logging
 26 - logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
 27
 28 - #
 29   class DialogueItem(BaseModel):
 30 -     speaker:
 31 -     display_speaker: str
 32       text: str
 33
 34   class Dialogue(BaseModel):
 35       dialogue: List[DialogueItem]
 36
 37 -
 38 - #
 39 -
 40 -
 41 -
 42 -
 43 -
 44 -
 45 -
 46 -
 47 -
 48 -
 49 -
 50 -
 51 -
 52 -
 53 -
 54 -
 55 -
 56 -
 57       print("[LOG] Extracting text from URL:", url)
 58       try:
 59           headers = {
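The DialogueItem and Dialogue classes shown in this hunk are plain Pydantic models; the updated file (added lines further down in this diff) narrows speaker to the literal values "Jane" and "John" while display_speaker keeps the name shown on screen. A minimal usage sketch, assuming utils.py is importable as utils:

    # Hypothetical usage sketch; the import path is an assumption, the field names follow the models above.
    from utils import Dialogue, DialogueItem

    items = [
        DialogueItem(speaker="Jane", display_speaker="Priya", text="Welcome to the show."),
        DialogueItem(speaker="John", display_speaker="Arjun", text="Thanks for having me."),
    ]
    script = Dialogue(dialogue=items)
    print(script.model_dump())  # use .dict() instead on Pydantic v1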
@@ -77,120 +74,77 @@ def extract_text_from_url(url: str) -> str:
 77       print(f"[ERROR] Exception during text extraction from URL: {e}")
 78       return ""
 79
 80 - def
 81 -     print("[LOG]
 82 -
 83 -
 84 -
 85 -         "&limit=1&namespace=0&format=json"
 86 -     )
 87 -     resp = requests.get(search_url)
 88 -     if resp.status_code != 200:
 89 -         print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
 90 -         return ""
 91 -     data = resp.json()
 92 -     if len(data) > 1 and data[1]:
 93 -         title = data[1][0]
 94 -         summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
 95 -         s_resp = requests.get(summary_url)
 96 -         if s_resp.status_code == 200:
 97 -             s_data = s_resp.json()
 98 -             if "extract" in s_data:
 99 -                 print("[LOG] Wikipedia summary fetched successfully.")
100 -                 return s_data["extract"]
101 -     return ""
102 - except Exception as e:
103 -     print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
104 -     return ""
105
106 - def
107 -
108 -
109 -
110 -     if resp.status_code != 200:
111 -         print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
112 -         return []
113 -     soup = BeautifulSoup(resp.content, "xml")
114 -     items = soup.find_all("item")
115 -     return items
116 - except Exception as e:
117 -     print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
118 -     return []
119
120 -
121 -
122 -
123 -
124 -
125 -     description = item.find("description").get_text().strip() if item.find("description") else ""
126 -     text = (title + " " + description).lower()
127 -     matches = sum(1 for kw in keywords if kw in text)
128 -     if matches >= min_match:
129 -         link = item.find("link").get_text().strip() if item.find("link") else ""
130 -         print(f"[LOG] Relevant article found: {title}")
131 -         return title, description, link
132 -     return None, None, None
133 -
134 - def fetch_article_text(link: str) -> str:
135 -     print("[LOG] Fetching article text from:", link)
136 -     if not link:
137 -         print("[LOG] No link provided for article text.")
138           return ""
139       try:
140 -
141 -
142 -
143 -
144 -
145 -
146 -
147 -         print("[LOG] Article text fetched successfully.")
148 -         return text.strip()
149       except Exception as e:
150 -         print(
151 -         return
152
153   def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
154 -     """
155 -     Query https://eventregistry.org/api/v1/article/getArticles
156 -     with the env var NEWS_API_KEY, searching for 'topic'.
157 -     Return list of {title, link, snippet}.
158 -     """
159       news_api_key = os.environ.get("NEWS_API_KEY")
160       if not news_api_key:
161           print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
162           return []
163 -
164       print("[LOG] Attempting Event Registry for topic:", topic)
165       endpoint = "https://eventregistry.org/api/v1/article/getArticles"
166 -     # Minimal example request body
167       body = {
168           "action": "getArticles",
169           "keyword": topic,
170           "articlesPage": 1,
171 -         "articlesCount": count,
172           "articlesSortBy": "date",
173           "articlesSortByAsc": False,
174           "dataType": ["news", "pr"],
175 -         "forceMaxDataTimeWindow": 31,
176           "resultType": "articles",
177           "apiKey": news_api_key
178       }
179 -
180       try:
181           resp = requests.post(endpoint, json=body, timeout=20)
182           resp.raise_for_status()
183           data = resp.json()
184 -         # According to docs, articles can be found at data["articles"]["results"]
185           art_data = data.get("articles", {})
186           results_arr = art_data.get("results", [])
187 -
188           ret = []
189           for item in results_arr:
190 -             # item might have "title", "url", "body" or "titleUri"
191               title = item.get("title", "")
192               url = item.get("url", "")
193 -             # pick either "body" or "excerpt"
194               snippet = item.get("body", "") or item.get("excerpt", "")
195               ret.append({"title": title, "link": url, "snippet": snippet})
196           return ret

@@ -198,6 +152,9 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
198           print("[ERROR] Event Registry approach failed:", e)
199           return []
200
201   def fetch_bing_results(query: str, count: int = 10) -> list:
202       serp_api_key = os.environ.get("SERP_API_KEY")
203       if not serp_api_key:

@@ -227,635 +184,11 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
227       print("[ERROR] Bing SerpApi approach failed:", e)
228       return []
229
230 -
231 - #
232 -
233 -
234 - def summarize_text(text: str, max_length: int = 200) -> str:
235 -     """
236 -     Summarizes the given text to the specified maximum word length.
237 -
238 -     Args:
239 -         text (str): The text to summarize.
240 -         max_length (int): The maximum number of words in the summary.
241 -
242 -     Returns:
243 -         str: The summarized text.
244 -     """
245 -     system_prompt = (
246 -         f"You are a professional summarizer. Please condense the following text "
247 -         f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
248 -     )
249 -     user_prompt = text
250 -
251 -     try:
252 -         summary = call_deepseek_api_cached(
253 -             system_prompt=system_prompt,
254 -             user_prompt=user_prompt,
255 -             max_tokens=500,  # Adjust as needed
256 -             temperature=0.5
257 -         )
258 -         return summary.strip()
259 -     except Exception as e:
260 -         print(f"[ERROR] Summarization failed: {e}")
261 -         # Fallback: return the original text truncated to max_length words
262 -         return " ".join(text.split()[:max_length]) + "..."
263 -
264 - ###############################################################################
265 - # Rewrites text in professional style
266 - ###############################################################################
267 -
268 - def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
269 -     if not raw_text.strip():
270 -         return ""
271 -
272 -     system_prompt = (
273 -         "You are a professional writing assistant. Your goal is to rewrite "
274 -         "the provided text so that it is:\n"
275 -         "1) Written in clear, fluent, professional English\n"
276 -         f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
277 -         "3) Organized in paragraphs or bullet points\n"
278 -         "4) Maintained or slightly enhanced in detail without significant summarization\n"
279 -         "5) No references to the rewriting process or disclaimers\n"
280 -     )
281 -
282 -     user_prompt = f"Please rewrite this text:\n\n{raw_text}"
283 -
284 -     try:
285 -         rewritten = call_deepseek_api_cached(
286 -             system_prompt=system_prompt,
287 -             user_prompt=user_prompt,
288 -             max_tokens=1024,
289 -             temperature=0.7
290 -         )
291 -         # Optionally, summarize the rewritten text to further reduce token count
292 -         summary = summarize_text(rewritten, max_length=150)
293 -         return summary
294 -     except Exception as e:
295 -         print("[ERROR] Rewriting text via Deepseek LLM failed:", e)
296 -         return raw_text
297 -
298 - ###############################################################################
299 - # OpenRouter API Communication Function with Exponential Backoff and Rate Limiting
300 - ###############################################################################
301 -
302 - ONE_MINUTE = 60
303 -
304 - @sleep_and_retry
305 - @limits(calls=5, period=ONE_MINUTE)  # Adjust based on OpenRouter's rate limits
306 - def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float, max_retries: int = 5) -> str:
307 -     """
308 -     Function to call DeepSeek R1 via OpenRouter API with exponential backoff for rate limiting.
309 -     """
310 -     logging.info("Communicating with DeepSeek R1 via OpenRouter API.")
311 -     headers = {
312 -         "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
313 -         "Content-Type": "application/json",
314 -         # Optional headers for OpenRouter leaderboard
315 -         # "HTTP-Referer": "<YOUR_SITE_URL>",
316 -         # "X-Title": "<YOUR_SITE_NAME>",
317 -     }
318 -     data = {
319 -         "model": "deepseek/deepseek-r1:free",  # Ensure this model name is correct
320 -         "messages": [
321 -             {"role": "system", "content": system_prompt},
322 -             {"role": "user", "content": user_prompt}
323 -         ],
324 -         "max_tokens": max_tokens,
325 -         "temperature": temperature
326 -     }
327 -
328 -     attempt = 0
329 -     backoff_time = 1  # Start with 1 second
330 -
331 -     while attempt < max_retries:
332 -         try:
333 -             response = requests.post("https://openrouter.ai/api/v1/chat/completions",
334 -                                      headers=headers, data=json.dumps(data))
335 -
336 -             logging.debug(f"OpenRouter API Response Status: {response.status_code}")
337 -             logging.debug(f"OpenRouter API Response Body: {response.text}")
338 -
339 -             if response.status_code == 200:
340 -                 response_json = response.json()
341 -                 if "choices" in response_json and response_json["choices"]:
342 -                     return response_json["choices"][0]["message"]["content"]
343 -                 else:
344 -                     logging.error("'choices' key missing in OpenRouter API response.")
345 -                     raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
346 -             elif response.status_code == 429:
347 -                 # Rate limit exceeded
348 -                 retry_after = response.headers.get("Retry-After")
349 -                 if retry_after:
350 -                     wait_time = int(retry_after)
351 -                 else:
352 -                     wait_time = backoff_time
353 -                 logging.warning(f"Rate limit exceeded. Attempt {attempt + 1} of {max_retries}. Retrying in {wait_time} seconds...")
354 -                 time.sleep(wait_time)
355 -                 backoff_time *= 2  # Exponential backoff
356 -                 attempt += 1
357 -             else:
358 -                 # Handle other HTTP errors
359 -                 try:
360 -                     error_message = response.json().get("error", {}).get("message", "Unknown error")
361 -                 except json.JSONDecodeError:
362 -                     error_message = "Non-JSON response received."
363 -                 logging.error(f"OpenRouter API error: {response.status_code} - {error_message}")
364 -                 raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
365 -
366 -         except requests.exceptions.RequestException as e:
367 -             logging.error(f"Request exception: {e}. Attempt {attempt + 1} of {max_retries}. Retrying in {backoff_time} seconds...")
368 -             time.sleep(backoff_time)
369 -             backoff_time *= 2
370 -             attempt += 1
371 -
372 -     # After max retries
373 -     logging.error("Max retries exceeded. Failed to get a valid response from OpenRouter API.")
374 -     raise ValueError("Rate limit exceeded. Please try again later.")
375 -
376 - @st.cache_data(show_spinner=False)
377 - def call_deepseek_api_cached(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
378 -     return call_deepseek_api(system_prompt, user_prompt, max_tokens, temperature)
379 -
380 - ###############################################################################
381 - # Professional Report Drafting Function
382 - ###############################################################################
383 -
384 - def _draft_professional_report(topic: str, sources_list: list) -> str:
385 -     """
386 -     Build a concise professional report:
387 -       - Title
388 -       - Executive Summary
389 -       - Introduction
390 -       - Main Body with sub-headings
391 -       - Analysis
392 -       - Conclusion
393 -       - References footnotes
394 -     Ensures at least ~1000 words.
395 -
396 -     Args:
397 -         topic (str): The research topic.
398 -         sources_list (list): List of summarized sources.
399 -
400 -     Returns:
401 -         str: The final professional report in Markdown format.
402 -     """
403 -     merged_text = []
404 -     footnotes = []
405 -     for s in sources_list:
406 -         footnotes.append(f"[^{s['index']}]: {s['link']}")
407 -         text_block = (
408 -             f"Source {s['index']} Title: {s['title']}\n"
409 -             f"FootnoteRef: [^{s['index']}]\n"
410 -             f"Text:\n{s['cleaned_text']}\n"
411 -         )
412 -         merged_text.append(text_block)
413 -     all_content = "\n\n".join(merged_text)
414 -
415 -     # Build the system prompt
416 -     system_prompt = f"""You are a highly skilled professional research analyst.
417 - You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
418 -
419 - **Report Structure:**
420 - 1. **Title:** {topic}
421 - 2. **Executive Summary:** A concise overview of key findings and insights.
422 - 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
423 - 4. **Main Body:**
424 -    - **Section 1:** Insights from Source 1.
425 -    - **Section 2:** Insights from Source 2.
426 -    - *(Continue as needed)*
427 -    - **Analysis:** An in-depth analysis combining information from all sources.
428 - 5. **Conclusion:** Final thoughts, implications, and potential future directions.
429 - 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
430 -
431 - **Requirements:**
432 - - **Length:** At least 1,000 words.
433 - - **Content Quality:**
434 -   - Incorporate relevant facts, figures, and statistics.
435 -   - Use professional and clear language.
436 -   - Ensure each section is well-developed without unnecessary repetition.
437 - - **Structure:** Logical and cohesive flow throughout the report.
438 - - **Formatting:** Proper formatting for headings, sub-headings, and references.
439 -
440 - **Aggregated Content from Sources:**
441 - -----------------------------------------------------------------------
442 - {all_content}
443 - -----------------------------------------------------------------------
444 - **Footnotes:**
445 - {chr(10).join(footnotes)}
446 - """
447 -
448 -     # Token Counting Function
449 -     def count_tokens(text: str) -> int:
450 -         tokenizer = tiktoken.get_encoding("cl100k_base")
451 -         tokens = tokenizer.encode(text)
452 -         return len(tokens)
453 -
454 -     # Calculate token counts
455 -     max_tokens = 6000  # OpenRouter's token limit
456 -     system_prompt_tokens = count_tokens(system_prompt)
457 -
458 -     logging.debug(f"Total tokens before optimization: {system_prompt_tokens}")
459 -
460 -     if system_prompt_tokens > max_tokens:
461 -         # Calculate allowed tokens for all_content
462 -         allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100  # Reserve 100 tokens buffer
463 -         if allowed_tokens_for_content <= 0:
464 -             logging.error("System prompt alone exceeds the token limit.")
465 -             return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
466 -
467 -         # Truncate all_content to fit
468 -         tokenizer = tiktoken.get_encoding("cl100k_base")
469 -         all_content_tokens_list = tokenizer.encode(all_content)
470 -         truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
471 -         truncated_content = tokenizer.decode(truncated_tokens)
472 -         system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
473 -         logging.debug(f"Truncated content to fit token limits: {len(truncated_tokens)} tokens")
474 -
475 -     try:
476 -         response = call_deepseek_api_cached(
477 -             system_prompt=system_prompt,
478 -             user_prompt="",  # No additional user prompt
479 -             max_tokens=3000,  # Adjusted to allow more detailed output
480 -             temperature=0.7
481 -         )
482 -         final_report = response.strip()
483 -         # Optionally, check word count
484 -         word_count = len(final_report.split())
485 -         if word_count < 1000:
486 -             logging.warning(f"Generated report is below desired length: {word_count} words.")
487 -         return final_report
488 -     except Exception as e:
489 -         logging.error(f"Could not finalize professional report: {e}")
490 -         return "An unexpected error occurred. Please try again later."
491 -
492 - ###############################################################################
493 - # PDF Generation Function
494 - ###############################################################################
495 -
496 - def generate_pdf_from_markdown(markdown_text: str) -> bytes:
497 -     """
498 -     Converts Markdown text to a PDF file.
499 -
500 -     Args:
501 -         markdown_text (str): The Markdown content to convert.
502 -
503 -     Returns:
504 -         bytes: The generated PDF file in bytes.
505 -     """
506 -     try:
507 -         # Convert Markdown to HTML
508 -         html = markdown.markdown(markdown_text)
509 -
510 -         # Generate PDF from HTML
511 -         pdf_bytes = pdfkit.from_string(html, False)  # False to return as bytes
512 -
513 -         return pdf_bytes
514 -     except Exception as e:
515 -         print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
516 -         return b""
517 -
518 - ###############################################################################
519 - # Audio Mixing Function
520 - ###############################################################################
521 -
522 - def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
523 -     """
524 -     Mixes spoken audio with background music.
525 -
526 -     Args:
527 -         spoken (AudioSegment): The spoken audio segment.
528 -         custom_music_path (str, optional): Path to custom background music. Defaults to None.
529 -
530 -     Returns:
531 -         AudioSegment: The mixed audio segment.
532 -     """
533 -     if custom_music_path:
534 -         music_path = custom_music_path
535 -     else:
536 -         music_path = "bg_music.mp3"
537 -
538 -     if not os.path.exists(music_path):
539 -         print(f"[ERROR] Background music file not found: {music_path}")
540 -         return spoken  # Return spoken audio without background music
541 -
542 -     try:
543 -         bg_music = AudioSegment.from_file(music_path, format="mp3")
544 -     except Exception as e:
545 -         print("[ERROR] Failed to load background music:", e)
546 -         return spoken
547 -
548 -     bg_music = bg_music - 18.0
549 -     total_length_ms = len(spoken) + 2000
550 -     looped_music = AudioSegment.empty()
551 -     while len(looped_music) < total_length_ms:
552 -         looped_music += bg_music
553 -     looped_music = looped_music[:total_length_ms]
554 -     final_mix = looped_music.overlay(spoken, position=2000)
555 -     return final_mix
556 -
557 - ###############################################################################
558 - # Generate Script Function and Helper
559 - ###############################################################################
560 -
561 - def generate_script(
562 -     system_prompt: str,
563 -     input_text: str,
564 -     tone: str,
565 -     target_length: str,
566 -     host_name: str = "Jane",
567 -     guest_name: str = "John",
568 -     sponsor_style: str = "Separate Break",
569 -     sponsor_provided: bool = False
570 - ) -> Dialogue:
571 -     """
572 -     Generates a podcast script using DeepSeek R1 via OpenRouter API.
573 -
574 -     Args:
575 -         system_prompt (str): System-level instructions for the LLM.
576 -         input_text (str): The main content or topic for the podcast.
577 -         tone (str): Desired tone of the podcast (e.g., Casual, Formal).
578 -         target_length (str): Desired length of the podcast (e.g., "3 Mins").
579 -         host_name (str, optional): Name of the host. Defaults to "Jane".
580 -         guest_name (str, optional): Name of the guest. Defaults to "John".
581 -         sponsor_style (str, optional): Style of sponsor integration. Defaults to "Separate Break".
582 -         sponsor_provided (bool, optional): Whether sponsor content is provided. Defaults to False.
583 -
584 -     Returns:
585 -         Dialogue: A Dialogue object containing dialogue items.
586 -     """
587 -     # Build the user prompt with additional instructions
588 -     user_prompt = (
589 -         f"Topic: {input_text}\n"
590 -         f"Tone: {tone}\n"
591 -         f"Length: {target_length}\n"
592 -         f"Host: {host_name or 'Jane'}\n"
593 -         f"Guest: {guest_name or 'John'}\n"
594 -     )
595 -     if sponsor_provided:
596 -         user_prompt += f"Sponsor Style: {sponsor_style}\n"
597 -
598 -     # Call the DeepSeek API to generate the script
599 -     try:
600 -         response = call_deepseek_api_cached(
601 -             system_prompt=system_prompt,
602 -             user_prompt=user_prompt,
603 -             max_tokens=1500,
604 -             temperature=0.7
605 -         )
606 -     except Exception as e:
607 -         print(f"[ERROR] Failed to generate script: {e}")
608 -         raise
609 -
610 -     # Parse the response into DialogueItems
611 -     dialogue_items = parse_script_to_dialogue(response, host_name, guest_name)
612 -
613 -     return Dialogue(dialogue=dialogue_items)
614 -
615 - def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str) -> List[DialogueItem]:
616 -     """
617 -     Parses the script text into a list of DialogueItem objects.
618 -
619 -     Args:
620 -         script_text (str): The raw script text generated by the LLM.
621 -         host_name (str): Name of the host.
622 -         guest_name (str): Name of the guest.
623 -
624 -     Returns:
625 -         List[DialogueItem]: A list of DialogueItem objects.
626 -     """
627 -     # Define a regex pattern to identify lines like "HostName: Dialogue"
628 -     pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
629 -     matches = re.findall(pattern, script_text)
630 -
631 -     dialogue_items = []
632 -     for speaker, text in matches:
633 -         speaker_normalized = "Jane" if speaker.lower() == host_name.lower() else "John"
634 -         item = DialogueItem(
635 -             speaker=speaker_normalized,
636 -             display_speaker=speaker,
637 -             text=text.strip()
638 -         )
639 -         dialogue_items.append(item)
640 -     return dialogue_items
641 -
642 - ###############################################################################
643 - # Generate Audio MP3 Function
644 - ###############################################################################
645 -
646 - def generate_audio_mp3(text: str, speaker: str) -> str:
647 -     """
648 -     Generates and returns the actual MP3 file path.
649 -     Utilizes Deepgram for English (American) and Murf for other languages.
650 -
651 -     Args:
652 -         text (str): The text to convert to speech.
653 -         speaker (str): The speaker identifier (e.g., "John", "Jane").
654 -
655 -     Returns:
656 -         str: The file path to the generated MP3 audio.
657 -     """
658 -     try:
659 -         import streamlit as st
660 -         print(f"[LOG] Generating audio for speaker: {speaker}")
661 -         language_selection = st.session_state.get("language_selection", "English (American)")
662 -
663 -         if language_selection == "English (American)":
664 -             print("[LOG] Using Deepgram TTS for English (American)")
665 -             # Process text if speaker is not Jane
666 -             if speaker in ["John", "Jane"]:
667 -                 processed_text = text
668 -             else:
669 -                 processed_text = _preprocess_text_for_tts(text, speaker)
670 -
671 -             deepgram_api_url = "https://api.deepgram.com/v1/speak"
672 -             params = {"model": "aura-asteria-en"}
673 -             if speaker == "John":
674 -                 params["model"] = "aura-zeus-en"
675 -
676 -             headers = {
677 -                 "Accept": "audio/mpeg",
678 -                 "Content-Type": "application/json",
679 -                 "Authorization": f"Token {os.environ.get('DEEPSEEK_API_KEY')}"
680 -             }
681 -             body = {"text": processed_text}
682 -             response = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)
683 -             if response.status_code != 200:
684 -                 raise ValueError(f"Deepgram TTS error: {response.status_code}, {response.text}")
685 -
686 -             content_type = response.headers.get('Content-Type', '')
687 -             if 'audio/mpeg' not in content_type:
688 -                 raise ValueError("Unexpected Content-Type from Deepgram.")
689 -
690 -             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
691 -                 for chunk in response.iter_content(chunk_size=8192):
692 -                     if chunk:
693 -                         mp3_file.write(chunk)
694 -                 mp3_path = mp3_file.name
695 -
696 -             if not os.path.exists(mp3_path):
697 -                 raise FileNotFoundError(f"Deepgram did not create the MP3 file: {mp3_path}")
698 -
699 -             audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
700 -             audio_seg = effects.normalize(audio_seg)
701 -             final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
702 -             audio_seg.export(final_mp3_path, format="mp3")
703 -             if os.path.exists(mp3_path):
704 -                 os.remove(mp3_path)
705 -
706 -             print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
707 -             if not os.path.exists(final_mp3_path):
708 -                 raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
709 -
710 -             return final_mp3_path
711 -
712 -         else:
713 -             print(f"[LOG] Using Murf API for language: {language_selection}")
714 -             # Process text if language is Hinglish or Hindi
715 -             if language_selection == "Hinglish":
716 -                 from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
717 -                 text = transliterate(text, DEVANAGARI, IAST)
718 -
719 -             api_key = os.environ.get("MURF_API_KEY")
720 -             headers = {
721 -                 "Content-Type": "application/json",
722 -                 "Accept": "application/json",
723 -                 "api-key": api_key
724 -             }
725 -             multi_native_locale = "hi-IN" if language_selection in ["Hinglish", "Hindi"] else "en-IN"
726 -             if language_selection == "English (Indian)":
727 -                 voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
728 -             elif language_selection in ["Hindi", "Hinglish"]:
729 -                 voice_id = "hi-IN-kabir" if speaker == "John" else "hi-IN-shweta"
730 -             else:
731 -                 voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
732 -
733 -             payload = {
734 -                 "audioDuration": 0,
735 -                 "channelType": "MONO",
736 -                 "encodeAsBase64": False,
737 -                 "format": "WAV",
738 -                 "modelVersion": "GEN2",
739 -                 "multiNativeLocale": multi_native_locale,
740 -                 "pitch": 0,
741 -                 "pronunciationDictionary": {},
742 -                 "rate": 0,
743 -                 "sampleRate": 48000,
744 -                 "style": "Conversational",
745 -                 "text": text,
746 -                 "variation": 1,
747 -                 "voiceId": voice_id
748 -             }
749 -             response = requests.post("https://api.murf.ai/v1/speech/generate", headers=headers, json=payload)
750 -             if response.status_code != 200:
751 -                 raise ValueError(f"Murf API error: {response.status_code}, {response.text}")
752 -
753 -             json_resp = response.json()
754 -             audio_url = json_resp.get("audioFile")
755 -             if not audio_url:
756 -                 raise ValueError("No audio file URL returned by Murf API")
757 -
758 -             audio_response = requests.get(audio_url)
759 -             if audio_response.status_code != 200:
760 -                 raise ValueError(f"Error fetching audio from {audio_url}")
761 -
762 -             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
763 -                 wav_file.write(audio_response.content)
764 -                 wav_path = wav_file.name
765 -
766 -             if not os.path.exists(wav_path):
767 -                 raise FileNotFoundError(f"Murf did not create the WAV file: {wav_path}")
768 -
769 -             audio_seg = AudioSegment.from_file(wav_path, format="wav")
770 -             audio_seg = effects.normalize(audio_seg)
771 -             final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
772 -             audio_seg.export(final_mp3_path, format="mp3")
773 -             os.remove(wav_path)
774 -
775 -             if not os.path.exists(final_mp3_path):
776 -                 raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
777 -
778 -             print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
779 -             return final_mp3_path
780 -
781 - def _preprocess_text_for_tts(text: str, speaker: str) -> str:
782 -     """
783 -     Preprocesses text for Text-to-Speech conversion by adding pauses, fillers,
784 -     and handling specific cases to make the speech sound more natural.
785 -
786 -     Args:
787 -         text (str): The original text to preprocess.
788 -         speaker (str): The speaker identifier (e.g., "John", "Jane").
789 -
790 -     Returns:
791 -         str: The preprocessed text.
792 -     """
793 -     # Unchanged logic for adding filler words, etc.
794 -     text = re.sub(r"\bNo\.\b", "Number", text)
795 -     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
796 -     abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
797 -
798 -     def insert_periods_for_abbrev(m):
799 -         abbr = m.group(0)
800 -         if abbr in abbreviations_as_words:
801 -             return abbr
802 -         return ".".join(list(abbr)) + "."
803 -
804 -     text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
805 -     text = re.sub(r"\.\.", ".", text)
806 -
807 -     def remove_periods_for_tts(m):
808 -         return m.group().replace(".", " ").strip()
809 -
810 -     text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
811 -     text = re.sub(r"-", " ", text)
812 -     text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
813 -     text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
814 -     text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
815 -
816 -     if speaker != "Jane":
817 -         def insert_thinking_pause(m):
818 -             word = m.group(1)
819 -             if random.random() < 0.3:
820 -                 filler = random.choice(['hmm,', 'well,', 'let me see,'])
821 -                 return f"{word}..., {filler}"
822 -             else:
823 -                 return f"{word}...,"
824 -
825 -         keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
826 -         text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
827 -
828 -         conj_pattern = r"\b(and|but|so|because|however)\b"
829 -         text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
830 -
831 -     text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
832 -
833 -     def capitalize_match(m):
834 -         return m.group().upper()
835 -
836 -     text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
837 -     return text.strip()
838 -
839 - ###############################################################################
840 - # Unified aggregator: google + bing + wiki + rss + event registry + fallback
841 - ###############################################################################
842 -
843   def perform_deep_research(topic: str) -> str:
844 -
845 -     Perform deep research by aggregating data from multiple sources.
846 -     Limits the number of sources to prevent exceeding token limits.
847 -     Summarizes each source's content to reduce token count.
848 -
849 -     Args:
850 -         topic (str): The research topic.
851 -
852 -     Returns:
853 -         str: The final professional report in Markdown format.
854 -     """
855 -     # Define the maximum number of sources per aggregator
856 -     MAX_SOURCES_PER_AGGREGATOR = 5
857 -
858 -     # Step 1: Google
859       google_cse_id = os.environ.get("GOOGLE_CSE_ID")
860       google_api_key = os.environ.get("GOOGLE_API_KEY")
861       google_sources = []

@@ -867,12 +200,12 @@ def perform_deep_research(topic: str) -> str:
867               "q": topic,
868               "cx": google_cse_id,
869               "key": google_api_key,
870 -             "num":
871           }
872           resp = requests.get(url, params=params, timeout=15)
873           resp.raise_for_status()
874           data = resp.json()
875 -         items = data.get("items", [])
876           for it in items:
877               google_sources.append({
878                   "title": it.get("title", ""),

@@ -881,11 +214,7 @@ def perform_deep_research(topic: str) -> str:
881               })
882       except Exception as e:
883           print("[ERROR] Google approach failed:", e)
884 -
885 -     # Step 2: Bing
886 -     bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
887 -
888 -     # Step 3: Wikipedia summary
889       wiki_summary_text = fetch_wikipedia_summary(topic)
890       wiki_item = None
891       if wiki_summary_text:

@@ -894,9 +223,6 @@ def perform_deep_research(topic: str) -> str:
894               "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
895               "snippet": wiki_summary_text
896           }
897 -
898 -     # Step 4: RSS approach (NewsAPI assumed here)
899 -     rss_sources = []
900       sources_dict = {
901           "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
902           "CNN": "http://rss.cnn.com/rss/edition.rss",

@@ -906,7 +232,8 @@ def perform_deep_research(topic: str) -> str:
906           "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
907           "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
908       }
909 -
910           try:
911               items = fetch_rss_feed(feed_url)
912               if not items:

@@ -929,11 +256,7 @@ def perform_deep_research(topic: str) -> str:
929           except Exception as e:
930               print(f"[ERROR] Error fetching from {name} RSS feed:", e)
931               continue
932 -
933 -     # Step 5: Event Registry
934 -     event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
935 -
936 -     # Combine all sources
937       combined = []
938       combined.extend(google_sources)
939       combined.extend(bing_results)

@@ -941,10 +264,8 @@ def perform_deep_research(topic: str) -> str:
941       combined.append(wiki_item)
942       combined.extend(rss_sources)
943       combined.extend(event_registry_res)
944 -
945       if not combined:
946           print("[LOG] No results found from aggregator. Using LLM fallback.")
947 -         # LLM-based fallback
948           fallback_text = query_llm_for_additional_info(topic, "")
949           cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
950           fallback_data = [{

@@ -955,22 +276,231 @@ def perform_deep_research(topic: str) -> str:
955           }]
956           return _draft_professional_report(topic, fallback_data)
957       else:
958 -
959 -
960 -         for
961 -
962 -
963 -
964 -
965 -
966 -
967 -
968
969 -
970
971 -
972 - #
973 -
974
975   def _spell_digits(d: str) -> str:
976       digit_map = {

@@ -980,6 +510,21 @@ def _spell_digits(d: str) -> str:
980       }
981       return " ".join(digit_map[ch] for ch in d if ch in digit_map)
982
983 -
984 -
985 -
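The removed call_deepseek_api above combined the ratelimit decorators (@sleep_and_retry, @limits) with a manual retry loop that doubled its wait time on HTTP 429 or network errors. Below is a minimal sketch of that pattern, stripped of the OpenRouter specifics; the endpoint, payload, and rate limits here are placeholders rather than values from this repository:

    import time
    import requests
    from ratelimit import limits, sleep_and_retry

    ONE_MINUTE = 60

    @sleep_and_retry
    @limits(calls=5, period=ONE_MINUTE)  # block until a call slot is free
    def post_with_backoff(url: str, payload: dict, max_retries: int = 5) -> dict:
        backoff = 1  # seconds; doubled after each failed attempt
        for attempt in range(max_retries):
            try:
                resp = requests.post(url, json=payload, timeout=30)
                if resp.status_code == 200:
                    return resp.json()
                if resp.status_code == 429:
                    # Prefer the server's Retry-After hint when it is present.
                    time.sleep(int(resp.headers.get("Retry-After", backoff)))
                else:
                    resp.raise_for_status()
            except requests.exceptions.RequestException:
                time.sleep(backoff)
            backoff *= 2
        raise RuntimeError("Max retries exceeded")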
utils.py (file after the update):

  1   import os
  2   import re
  3 + import json
  4   import requests
  5   import tempfile
  6   from bs4 import BeautifulSoup
  7 + from typing import List, Literal
  8 + from pydantic import BaseModel
  9   from pydub import AudioSegment, effects
 10 + from transformers import pipeline
 11 + import yt_dlp
 12   import tiktoken
 13 + import numpy as np
 14 + import torch
 15   import random
 16 + import base64
 17 + from io import BytesIO
 18 + import pdfkit
 19 + import markdown  # For Markdown to HTML conversion
 20
 21 + # ------------------------------
 22 + # Data models
 23 + # ------------------------------
 24   class DialogueItem(BaseModel):
 25 +     speaker: Literal["Jane", "John"]
 26 +     display_speaker: str = "Jane"
 27       text: str
 28
 29   class Dialogue(BaseModel):
 30       dialogue: List[DialogueItem]
 31
 32 + # ------------------------------
 33 + # ASR Pipeline setup
 34 + # ------------------------------
 35 + asr_pipeline = pipeline(
 36 +     "automatic-speech-recognition",
 37 +     model="openai/whisper-tiny.en",
 38 +     device=0 if torch.cuda.is_available() else -1
 39 + )
 40 +
 41 + # ------------------------------
 42 + # Helper functions
 43 + # ------------------------------
 44 + def truncate_text(text, max_tokens=2048):
 45 +     print("[LOG] Truncating text if needed.")
 46 +     tokenizer = tiktoken.get_encoding("cl100k_base")
 47 +     tokens = tokenizer.encode(text)
 48 +     if len(tokens) > max_tokens:
 49 +         print("[LOG] Text too long, truncating.")
 50 +         return tokenizer.decode(tokens[:max_tokens])
 51 +     return text
 52 +
 53 + def extract_text_from_url(url):
 54       print("[LOG] Extracting text from URL:", url)
 55       try:
 56           headers = {
 74       print(f"[ERROR] Exception during text extraction from URL: {e}")
 75       return ""
 76
 77 + def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
 78 +     print(f"[LOG] Shifting pitch by {semitones} semitones.")
 79 +     new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
 80 +     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
 81 +     return shifted_audio.set_frame_rate(audio.frame_rate)
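The added pitch_shift helper uses the classic resampling trick: it re-tags the raw samples with a frame rate scaled by 2 ** (semitones / 12) and then converts back to the original rate, which shifts the pitch but also changes playback speed, a known side effect of this approach. A small usage sketch, assuming utils.py is importable and using placeholder file names:

    from pydub import AudioSegment
    from utils import pitch_shift  # hypothetical import path for this module

    voice = AudioSegment.from_file("narration.mp3", format="mp3")  # placeholder input file
    deeper = pitch_shift(voice, -2)  # negative semitone values lower the pitch
    deeper.export("narration_deeper.mp3", format="mp3")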
 82
 83 + def is_sufficient(text: str, min_word_count: int = 500) -> bool:
 84 +     word_count = len(text.split())
 85 +     print(f"[DEBUG] Aggregated word count: {word_count}")
 86 +     return word_count >= min_word_count
 87
 88 + # ------------------------------
 89 + # Text rewriting using DeepSeek (via OpenRouter)
 90 + # ------------------------------
 91 + def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
 92 +     if not raw_text.strip():
 93           return ""
 94 +     system_prompt = (
 95 +         "You are a professional writing assistant. Your goal is to rewrite "
 96 +         "the provided text so that it is:\n"
 97 +         "1) Written in clear, fluent, professional English\n"
 98 +         f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
 99 +         "3) Organized in paragraphs or bullet points\n"
100 +         "4) Maintained or slightly enhanced in detail without significant summarization\n"
101 +         "5) No references to the rewriting process or disclaimers\n"
102 +     )
103 +     user_prompt = f"Please rewrite this text:\n\n{raw_text}"
104       try:
105 +         response = call_deepseek_api(
106 +             system_prompt=system_prompt,
107 +             user_prompt=user_prompt,
108 +             max_tokens=1024,
109 +             temperature=0.7
110 +         )
111 +         return response.strip()
112       except Exception as e:
113 +         print("[ERROR] rewriting text via Deepseek LLM failed:", e)
114 +         return raw_text
115
116 + # ------------------------------
117 + # Event Registry aggregator
118 + # ------------------------------
119   def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
120       news_api_key = os.environ.get("NEWS_API_KEY")
121       if not news_api_key:
122           print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
123           return []
124       print("[LOG] Attempting Event Registry for topic:", topic)
125       endpoint = "https://eventregistry.org/api/v1/article/getArticles"
126       body = {
127           "action": "getArticles",
128           "keyword": topic,
129           "articlesPage": 1,
130 +         "articlesCount": count,
131           "articlesSortBy": "date",
132           "articlesSortByAsc": False,
133           "dataType": ["news", "pr"],
134 +         "forceMaxDataTimeWindow": 31,
135           "resultType": "articles",
136           "apiKey": news_api_key
137       }
138       try:
139           resp = requests.post(endpoint, json=body, timeout=20)
140           resp.raise_for_status()
141           data = resp.json()
142           art_data = data.get("articles", {})
143           results_arr = art_data.get("results", [])
144           ret = []
145           for item in results_arr:
146               title = item.get("title", "")
147               url = item.get("url", "")
148               snippet = item.get("body", "") or item.get("excerpt", "")
149               ret.append({"title": title, "link": url, "snippet": snippet})
150           return ret
152           print("[ERROR] Event Registry approach failed:", e)
153           return []
154
155 + # ------------------------------
156 + # Bing results via SerpApi
157 + # ------------------------------
158   def fetch_bing_results(query: str, count: int = 10) -> list:
159       serp_api_key = os.environ.get("SERP_API_KEY")
160       if not serp_api_key:
184       print("[ERROR] Bing SerpApi approach failed:", e)
185       return []
186
187 + # ------------------------------
188 + # Unified deep research aggregator
189 + # ------------------------------
190   def perform_deep_research(topic: str) -> str:
191 +     # Limit each source to a maximum of 5 items
192       google_cse_id = os.environ.get("GOOGLE_CSE_ID")
193       google_api_key = os.environ.get("GOOGLE_API_KEY")
194       google_sources = []
200               "q": topic,
201               "cx": google_cse_id,
202               "key": google_api_key,
203 +             "num": 5
204           }
205           resp = requests.get(url, params=params, timeout=15)
206           resp.raise_for_status()
207           data = resp.json()
208 +         items = data.get("items", [])
209           for it in items:
210               google_sources.append({
211                   "title": it.get("title", ""),
214               })
215       except Exception as e:
216           print("[ERROR] Google approach failed:", e)
217 +     bing_results = fetch_bing_results(topic, count=5)
218       wiki_summary_text = fetch_wikipedia_summary(topic)
219       wiki_item = None
220       if wiki_summary_text:
223               "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
224               "snippet": wiki_summary_text
225           }
226       sources_dict = {
227           "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
228           "CNN": "http://rss.cnn.com/rss/edition.rss",
232           "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
233           "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
234       }
235 +     rss_sources = []
236 +     for name, feed_url in sources_dict.items():
237           try:
238               items = fetch_rss_feed(feed_url)
239               if not items:
256           except Exception as e:
257               print(f"[ERROR] Error fetching from {name} RSS feed:", e)
258               continue
259 +     event_registry_res = fetch_eventregistry_articles(topic, count=5)
260       combined = []
261       combined.extend(google_sources)
262       combined.extend(bing_results)
264       combined.append(wiki_item)
265       combined.extend(rss_sources)
266       combined.extend(event_registry_res)
267       if not combined:
268           print("[LOG] No results found from aggregator. Using LLM fallback.")
269           fallback_text = query_llm_for_additional_info(topic, "")
270           cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
271           fallback_data = [{
276           }]
277           return _draft_professional_report(topic, fallback_data)
278       else:
279 +         final_list = []
280 +         idx = 0
281 +         for source in combined:
282 +             idx += 1
283 +             link = source.get("link", "")
284 +             snippet = source.get("snippet", "")
285 +             title = source.get("title", "")
286 +             cleaned_text = rewrite_in_professional_style(topic, snippet)
287 +             if cleaned_text.strip():
288 +                 final_list.append({
289 +                     "index": idx,
290 +                     "title": title,
291 +                     "link": link,
292 +                     "cleaned_text": cleaned_text
293 +                 })
294 +         if not final_list:
295 +             print("[LOG] Aggregator produced no final content after rewriting. Using LLM fallback.")
296 +             fallback_text = query_llm_for_additional_info(topic, "")
297 +             cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
298 +             fallback_data = [{
299 +                 "index": 1,
300 +                 "title": "Fallback Info",
301 +                 "link": "N/A",
302 +                 "cleaned_text": cleaned_fb
303 +             }]
304 +             return _draft_professional_report(topic, fallback_data)
305 +         return _draft_professional_report(topic, final_list)
306 +
307 + def _draft_professional_report(topic: str, sources_list: list) -> str:
308 +     merged_text = []
309 +     footnotes = []
310 +     for s in sources_list:
311 +         footnotes.append(f"[^{s['index']}]: {s['link']}")
312 +         text_block = (
313 +             f"Source {s['index']} Title: {s['title']}\n"
314 +             f"FootnoteRef: [^{s['index']}]\n"
315 +             f"Text:\n{s['cleaned_text']}\n"
316 +         )
317 +         merged_text.append(text_block)
318 +     all_content = "\n\n".join(merged_text)
319 +     system_prompt = f"""You are a highly skilled professional research analyst.
320 + You have access to multiple authoritative sources on the topic: {topic}.
321 + Your task is to produce a comprehensive and detailed formal research report that includes the following sections:
322 +
323 + 1. **Title:** Use the topic as the title of the report.
324 + 2. **Executive Summary:** Provide a concise overview highlighting the key findings and insights.
325 + 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
326 + 4. **Main Body:**
327 +    - **Sub-heading 1:** Summarize insights from Source 1.
328 +    - **Sub-heading 2:** Summarize insights from Source 2.
329 +    - *(Continue as needed for all sources)*
330 +    - **Analysis:** Provide an in-depth analysis combining information from all sources.
331 + 5. **Conclusion:** Present final thoughts, implications, and potential future directions.
332 + 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
333 +
334 + **Requirements:**
335 + - **Length:** The report must be at least **1,000 words** in total.
336 + - **Content Quality:**
337 +   - Incorporate relevant facts, figures, and statistics.
338 +   - Use professional and clear language.
339 +   - Ensure each section is well-developed without unnecessary repetition.
340 + - **Structure:** Maintain a logical and cohesive flow throughout the report.
341 + - **Formatting:** Use proper formatting for headings, sub-headings, and references.
342 +
343 + **Below is the aggregated content from your sources (with footnote references):**
344 + -----------------------------------------------------------------------
345 + {all_content}
346 + -----------------------------------------------------------------------
347 + **Footnotes:**
348 + {chr(10).join(footnotes)}
349 + """
350 +     def count_tokens(text: str) -> int:
351 +         tokenizer = tiktoken.get_encoding("cl100k_base")
352 +         tokens = tokenizer.encode(text)
353 +         return len(tokens)
354 +     max_tokens = 6000
355 +     system_prompt_tokens = count_tokens(system_prompt)
356 +     all_content_tokens = count_tokens(all_content)
357 +     total_tokens = system_prompt_tokens + all_content_tokens
358 +     print(f"[DEBUG] Total tokens before optimization: {total_tokens}")
359 +     if total_tokens > max_tokens:
360 +         allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100
361 +         if allowed_tokens_for_content <= 0:
362 +             print("[ERROR] System prompt alone exceeds the token limit.")
363 +             return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
364 +         tokenizer = tiktoken.get_encoding("cl100k_base")
365 +         all_content_tokens_list = tokenizer.encode(all_content)
366 +         truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
367 +         truncated_content = tokenizer.decode(truncated_tokens)
368 +         system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
369 +         print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
370 +     try:
371 +         response = call_deepseek_api(
372 +             system_prompt=system_prompt,
373 +             user_prompt="",
374 +             max_tokens=3000,
375 +             temperature=0.7
376 +         )
377 +         final_report = response.strip()
378 +         word_count = len(final_report.split())
379 +         if word_count < 1000:
380 +             print(f"[WARNING] Generated report is below desired length: {word_count} words.")
381 +         return final_report
382 +     except Exception as e:
383 +         print("[ERROR] Could not finalize professional report:", e)
384 +         return "An unexpected error occurred. Please try again later."
385 +
386 + def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
387 +     print("[LOG] Communicating with DeepSeek R1 via OpenRouter API.")
388 +     try:
389 +         headers = {
390 +             "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
391 +             "HTTP-Referer": "https://yourdomain.com",  # Replace with your site URL if needed
392 +             "X-Title": "MyPod",  # Replace with your site name if needed
393 +             "Content-Type": "application/json"
394 +         }
395 +         data = {
396 +             "model": "deepseek/deepseek-r1:free",
397 +             "messages": [
398 +                 {"role": "system", "content": system_prompt},
399 +                 {"role": "user", "content": user_prompt}
400 +             ],
401 +             "max_tokens": max_tokens,
402 +             "temperature": temperature
403 +         }
404 +         response = requests.post("https://openrouter.ai/api/v1/chat/completions",
405 +                                  headers=headers, data=json.dumps(data))
406 +         response.raise_for_status()
407 +         json_response = response.json()
408 +         if "choices" not in json_response:
409 +             raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
410 +         return json_response["choices"][0]["message"]["content"]
411 +     except requests.exceptions.HTTPError as e:
412 +         status_code = e.response.status_code
413 +         if status_code == 503:
414 +             print("[ERROR] Service Unavailable from Deepseek API.")
415 +             raise ValueError("Service is currently unavailable. Please try again later.")
416 +         elif status_code == 413:
417 +             print("[ERROR] Request too large for Deepseek API.")
418 +             raise ValueError("The request is too large. Please reduce the input size and try again.")
419 +         else:
420 +             print("[ERROR] Deepseek API error:", e)
421 +             raise ValueError("An error occurred while generating the report. Please try again later.")
422 +     except Exception as e:
423 +         print("[ERROR] Could not communicate with Deepseek API:", e)
424 +         raise ValueError("An unexpected error occurred. Please try again later.")
425 +
426 + def generate_pdf_from_markdown(markdown_text: str) -> bytes:
427 +     try:
428 +         html = markdown.markdown(markdown_text, extensions=["extra", "tables", "toc"])
429 +         pdf_bytes = pdfkit.from_string(html, False)
430 +         return pdf_bytes
431 +     except Exception as e:
432 +         print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
433 +         return b""
434 +
435 + def fetch_wikipedia_summary(topic: str) -> str:
436 +     print("[LOG] Fetching Wikipedia summary for:", topic)
437 +     try:
438 +         search_url = (
439 +             f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
440 +             "&limit=1&namespace=0&format=json"
441 +         )
442 +         resp = requests.get(search_url)
443 +         if resp.status_code != 200:
444 +             print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
445 +             return ""
446 +         data = resp.json()
447 +         if len(data) > 1 and data[1]:
448 +             title = data[1][0]
449 +             summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
450 +             s_resp = requests.get(summary_url)
451 +             if s_resp.status_code == 200:
452 +                 s_data = s_resp.json()
453 +                 if "extract" in s_data:
454 +                     print("[LOG] Wikipedia summary fetched successfully.")
455 +                     return s_data["extract"]
456 +         return ""
457 +     except Exception as e:
458 +         print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
459 +         return ""
460 +
461 + def fetch_rss_feed(feed_url: str) -> list:
462 +     print("[LOG] Fetching RSS feed:", feed_url)
463 +     try:
464 +         resp = requests.get(feed_url)
465 +         if resp.status_code != 200:
466 +             print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
467 +             return []
468 +         soup = BeautifulSoup(resp.content, "xml")
469 +         items = soup.find_all("item")
470 +         return items
471 +     except Exception as e:
472 +         print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
473 +         return []
474
475 + def find_relevant_article(items, topic: str, min_match=2) -> tuple:
476 +     print("[LOG] Finding relevant articles...")
477 +     keywords = re.findall(r'\w+', topic.lower())
478 +     for item in items:
479 +         title = item.find("title").get_text().strip() if item.find("title") else ""
480 +         description = item.find("description").get_text().strip() if item.find("description") else ""
481 +         text = (title + " " + description).lower()
482 +         matches = sum(1 for kw in keywords if kw in text)
483 +         if matches >= min_match:
484 +             link = item.find("link").get_text().strip() if item.find("link") else ""
485 +             print(f"[LOG] Relevant article found: {title}")
486 +             return title, description, link
487 +     return None, None, None
488
489 + # ------------------------------
490 + # Preprocess text for TTS
491 + # ------------------------------
492 + def _preprocess_text_for_tts(text: str, speaker: str) -> str:
493 +     text = re.sub(r"\bNo\.\b", "Number", text)
494 +     text = re.sub(r"\b(?i)SaaS\b", "sass", text)
495 +     abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
496 +     def insert_periods_for_abbrev(m):
497 +         abbr = m.group(0)
498 +         if abbr in abbreviations_as_words:
499 +             return abbr
500 +         return ".".join(list(abbr)) + "."
501 +     text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
502 +     text = re.sub(r"\.\.", ".", text)
503 +     return text
504
505   def _spell_digits(d: str) -> str:
506       digit_map = {
510       }
511       return " ".join(digit_map[ch] for ch in d if ch in digit_map)
512
513 + def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
514 +     if custom_music_path:
515 +         music_path = custom_music_path
516 +     else:
517 +         music_path = "bg_music.mp3"
518 +     try:
519 +         bg_music = AudioSegment.from_file(music_path, format="mp3")
520 +     except Exception as e:
521 +         print("[ERROR] Failed to load background music:", e)
522 +         return spoken
523 +     bg_music = bg_music - 18.0
524 +     total_length_ms = len(spoken) + 2000
525 +     looped_music = AudioSegment.empty()
526 +     while len(looped_music) < total_length_ms:
527 +         looped_music += bg_music
528 +     looped_music = looped_music[:total_length_ms]
529 +     final_mix = looped_music.overlay(spoken, position=2000)
530 +     return final_mix
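The re-added mix_with_bg_music lowers the music bed by 18 dB, loops it until it covers the narration plus two seconds, and overlays the narration starting at the 2-second mark. A short usage sketch, assuming utils.py is importable and using placeholder file names:

    from pydub import AudioSegment
    from utils import mix_with_bg_music  # hypothetical import path for this module

    narration = AudioSegment.from_file("episode_voice.mp3", format="mp3")   # placeholder input file
    mixed = mix_with_bg_music(narration)                  # uses the default bg_music.mp3 next to the script
    # mixed = mix_with_bg_music(narration, "my_bed.mp3")  # or pass a custom music bed
    mixed.export("episode_final.mp3", format="mp3")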