siddhartharyaai committed on
Commit 66b4c56 · verified · 1 Parent(s): fcd65f7

Update utils.py

Files changed (1)
  1. utils.py +606 -358
utils.py CHANGED
@@ -1,50 +1,43 @@
1
  import os
2
- import re
3
  import json
 
4
  import requests
5
  import tempfile
6
- from bs4 import BeautifulSoup
7
- from typing import List, Literal
8
  from pydantic import BaseModel
 
9
  from pydub import AudioSegment, effects
10
- from transformers import pipeline
11
- import yt_dlp
12
  import tiktoken
13
- import numpy as np
14
- import torch
15
- import random
16
 
17
- import base64
18
- from io import BytesIO
19
- import pdfkit
20
- import markdown # Added for Markdown to HTML conversion
21
-
22
- # Define Dialogue Models
23
  class DialogueItem(BaseModel):
24
- speaker: Literal["Jane", "John"]
25
- display_speaker: str = "Jane"
26
  text: str
27
 
28
  class Dialogue(BaseModel):
29
  dialogue: List[DialogueItem]
30
 
31
- # Initialize ASR Pipeline (if used elsewhere)
32
- asr_pipeline = pipeline(
33
- "automatic-speech-recognition",
34
- model="openai/whisper-tiny.en",
35
- device=0 if torch.cuda.is_available() else -1
36
- )
37
-
38
- def truncate_text(text, max_tokens=2048):
39
- print("[LOG] Truncating text if needed.")
40
- tokenizer = tiktoken.get_encoding("cl100k_base")
41
- tokens = tokenizer.encode(text)
42
- if len(tokens) > max_tokens:
43
- print("[LOG] Text too long, truncating.")
44
- return tokenizer.decode(tokens[:max_tokens])
45
- return text
46
-
47
- def extract_text_from_url(url):
 
 
 
48
  print("[LOG] Extracting text from URL:", url)
49
  try:
50
  headers = {
@@ -68,51 +61,79 @@ def extract_text_from_url(url):
68
  print(f"[ERROR] Exception during text extraction from URL: {e}")
69
  return ""
70
 
71
- def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
72
- print(f"[LOG] Shifting pitch by {semitones} semitones.")
73
- new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
74
- shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
75
- return shifted_audio.set_frame_rate(audio.frame_rate)
76
-
77
- def is_sufficient(text: str, min_word_count: int = 500) -> bool:
78
- word_count = len(text.split())
79
- print(f"[DEBUG] Aggregated word count: {word_count}")
80
- return word_count >= min_word_count
81
-
82
- ###############################################################################
83
- # Rewrites text in professional style
84
- ###############################################################################
85
- def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
86
- if not raw_text.strip():
87
  return ""
88
 
89
- system_prompt = (
90
- "You are a professional writing assistant. Your goal is to rewrite "
91
- "the provided text so that it is:\n"
92
- "1) Written in clear, fluent, professional English\n"
93
- f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
94
- "3) Organized in paragraphs or bullet points\n"
95
- "4) Maintained or slightly enhanced in detail without significant summarization\n"
96
- "5) No references to the rewriting process or disclaimers\n"
97
- )
 
 
 
 
98
 
99
- user_prompt = f"Please rewrite this text:\n\n{raw_text}"
100
101
  try:
102
- response = call_deepseek_api(
103
- system_prompt=system_prompt,
104
- user_prompt=user_prompt,
105
- max_tokens=1024,
106
- temperature=0.7
107
- )
108
- return response.strip()
 
 
109
  except Exception as e:
110
- print("[ERROR] rewriting text via Deepseek LLM failed:", e)
111
- return raw_text
112
 
113
- ###############################################################################
114
- # Event Registry (News API) aggregator
115
- ###############################################################################
116
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
117
  """
118
  Query https://eventregistry.org/api/v1/article/getArticles
@@ -161,9 +182,6 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
161
  print("[ERROR] Event Registry approach failed:", e)
162
  return []
163
 
164
- ###############################################################################
165
- # Bing via SerpApi
166
- ###############################################################################
167
  def fetch_bing_results(query: str, count: int = 10) -> list:
168
  serp_api_key = os.environ.get("SERP_API_KEY")
169
  if not serp_api_key:
@@ -193,18 +211,103 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
193
  print("[ERROR] Bing SerpApi approach failed:", e)
194
  return []
195
 
196
  ###############################################################################
197
  # Unified aggregator: google + bing + wiki + rss + event registry + fallback
198
  ###############################################################################
 
199
  def perform_deep_research(topic: str) -> str:
200
  """
201
- 1) Google (up to 10) if creds
202
- 2) Bing (up to 10) if SERP_API_KEY
203
- 3) Wikipedia summary
204
- 4) RSS approach
205
- 5) Event Registry (news api) if NEWS_API_KEY
206
- 6) If still nothing, use LLM fallback
 
 
 
207
  """
 
 
 
208
  # Step 1: Google
209
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
210
  google_api_key = os.environ.get("GOOGLE_API_KEY")
@@ -217,12 +320,12 @@ def perform_deep_research(topic: str) -> str:
217
  "q": topic,
218
  "cx": google_cse_id,
219
  "key": google_api_key,
220
- "num": 10
221
  }
222
  resp = requests.get(url, params=params, timeout=15)
223
  resp.raise_for_status()
224
  data = resp.json()
225
- items = data.get("items", [])
226
  for it in items:
227
  google_sources.append({
228
  "title": it.get("title", ""),
@@ -233,7 +336,7 @@ def perform_deep_research(topic: str) -> str:
233
  print("[ERROR] Google approach failed:", e)
234
 
235
  # Step 2: Bing
236
- bing_results = fetch_bing_results(topic, count=10)
237
 
238
  # Step 3: Wikipedia summary
239
  wiki_summary_text = fetch_wikipedia_summary(topic)
@@ -245,7 +348,8 @@ def perform_deep_research(topic: str) -> str:
245
  "snippet": wiki_summary_text
246
  }
247
 
248
- # Step 4: RSS approach
 
249
  sources_dict = {
250
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
251
  "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -255,8 +359,7 @@ def perform_deep_research(topic: str) -> str:
255
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
256
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
257
  }
258
- rss_sources = []
259
- for name, feed_url in sources_dict.items():
260
  try:
261
  items = fetch_rss_feed(feed_url)
262
  if not items:
@@ -281,9 +384,9 @@ def perform_deep_research(topic: str) -> str:
281
  continue
282
 
283
  # Step 5: Event Registry
284
- event_registry_res = fetch_eventregistry_articles(topic, count=10)
285
 
286
- # Combine everything
287
  combined = []
288
  combined.extend(google_sources)
289
  combined.extend(bing_results)
@@ -305,42 +408,26 @@ def perform_deep_research(topic: str) -> str:
305
  }]
306
  return _draft_professional_report(topic, fallback_data)
307
  else:
308
- # Rewrite each
309
- final_list = []
310
- idx = 0
311
- for source in combined:
312
- idx += 1
313
- link = source.get("link", "")
314
- snippet = source.get("snippet", "")
315
- title = source.get("title", "")
316
-
317
- cleaned_text = rewrite_in_professional_style(topic, snippet)
318
- if cleaned_text.strip():
319
- final_list.append({
320
- "index": idx,
321
- "title": title,
322
- "link": link,
323
- "cleaned_text": cleaned_text
324
- })
325
 
326
- if not final_list:
327
- print("[LOG] Aggregator produced no final content after rewriting. Using LLM fallback.")
328
- # LLM-based fallback
329
- fallback_text = query_llm_for_additional_info(topic, "")
330
- cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
331
- fallback_data = [{
332
- "index": 1,
333
- "title": "Fallback Info",
334
- "link": "N/A",
335
- "cleaned_text": cleaned_fb
336
- }]
337
- return _draft_professional_report(topic, fallback_data)
338
-
339
- return _draft_professional_report(topic, final_list)
340
 
341
  def _draft_professional_report(topic: str, sources_list: list) -> str:
342
  """
343
- Build final professional doc:
344
  - Title
345
  - Executive Summary
346
  - Introduction
@@ -349,6 +436,13 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
349
  - Conclusion
350
  - References footnotes
351
  Ensures at least ~1000 words.
  """
353
  merged_text = []
354
  footnotes = []
@@ -362,32 +456,32 @@ def _draft_professional_report(topic: str, sources_list: list) -> str:
362
  merged_text.append(text_block)
363
  all_content = "\n\n".join(merged_text)
364
 
365
- # ENFORCE LONGER REPORT (~1000 words).
366
  system_prompt = f"""You are a highly skilled professional research analyst.
367
- You have access to multiple authoritative sources on the topic: {topic}.
368
- Your task is to produce a comprehensive and detailed formal research report that includes the following sections:
369
 
370
- 1. **Title:** Use the topic as the title of the report.
371
- 2. **Executive Summary:** Provide a concise overview highlighting the key findings and insights.
 
372
  3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
373
  4. **Main Body:**
374
- - **Sub-heading 1:** Summarize insights from Source 1.
375
- - **Sub-heading 2:** Summarize insights from Source 2.
376
- - *(Continue as needed for all sources)*
377
- - **Analysis:** Provide an in-depth analysis combining information from all sources.
378
- 5. **Conclusion:** Present final thoughts, implications, and potential future directions.
379
  6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
380
 
381
  **Requirements:**
382
- - **Length:** The report must be at least **1,000 words** in total.
383
  - **Content Quality:**
384
  - Incorporate relevant facts, figures, and statistics.
385
  - Use professional and clear language.
386
  - Ensure each section is well-developed without unnecessary repetition.
387
- - **Structure:** Maintain a logical and cohesive flow throughout the report.
388
- - **Formatting:** Use proper formatting for headings, sub-headings, and references.
389
 
390
- **Below is the aggregated content from your sources (with footnote references):**
391
  -----------------------------------------------------------------------
392
  {all_content}
393
  -----------------------------------------------------------------------
@@ -404,18 +498,16 @@ Your task is to produce a comprehensive and detailed formal research report that
404
  # Calculate token counts
405
  max_tokens = 6000 # OpenRouter's token limit
406
  system_prompt_tokens = count_tokens(system_prompt)
407
- all_content_tokens = count_tokens(all_content)
408
- total_tokens = system_prompt_tokens + all_content_tokens
409
 
410
- print(f"[DEBUG] Total tokens before optimization: {total_tokens}")
411
 
412
- if total_tokens > max_tokens:
413
  # Calculate allowed tokens for all_content
414
  allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
415
  if allowed_tokens_for_content <= 0:
416
  print("[ERROR] System prompt alone exceeds the token limit.")
417
  return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
418
-
419
  # Truncate all_content to fit
420
  tokenizer = tiktoken.get_encoding("cl100k_base")
421
  all_content_tokens_list = tokenizer.encode(all_content)
@@ -428,7 +520,7 @@ Your task is to produce a comprehensive and detailed formal research report that
428
  response = call_deepseek_api(
429
  system_prompt=system_prompt,
430
  user_prompt="", # No additional user prompt
431
- max_tokens=3000, # Increased to allow more detailed output
432
  temperature=0.7
433
  )
434
  final_report = response.strip()
@@ -441,6 +533,10 @@ Your task is to produce a comprehensive and detailed formal research report that
441
  print("[ERROR] Could not finalize professional report:", e)
442
  return "An unexpected error occurred. Please try again later."
443
444
  def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
445
  """
446
  Function to call DeepSeek R1 via OpenRouter API.
@@ -465,8 +561,17 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, tem
465
  }
466
  response = requests.post("https://openrouter.ai/api/v1/chat/completions",
467
  headers=headers, data=json.dumps(data))
468
- response.raise_for_status()
469
- return response.json()["choices"][0]["message"]["content"]
470
  except requests.exceptions.HTTPError as e:
471
  status_code = e.response.status_code
472
  error_content = e.response.json()
@@ -483,108 +588,14 @@ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, tem
483
  print("[ERROR] Could not communicate with OpenRouter API:", e)
484
  raise ValueError("An unexpected error occurred. Please try again later.")
485
 
486
- def generate_pdf_from_markdown(markdown_text: str) -> bytes:
487
- """
488
- Converts Markdown text to a PDF file.
489
-
490
- Args:
491
- markdown_text (str): The Markdown content to convert.
492
 
493
- Returns:
494
- bytes: The generated PDF file in bytes.
495
- """
496
- try:
497
- # Convert Markdown to HTML
498
- html = markdown.markdown(markdown_text)
499
-
500
- # Generate PDF from HTML
501
- pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
502
-
503
- return pdf_bytes
504
- except Exception as e:
505
- print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
506
- return b""
507
-
508
- def fetch_wikipedia_summary(topic: str) -> str:
509
- print("[LOG] Fetching Wikipedia summary for:", topic)
510
- try:
511
- search_url = (
512
- f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
513
- "&limit=1&namespace=0&format=json"
514
- )
515
- resp = requests.get(search_url)
516
- if resp.status_code != 200:
517
- print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
518
- return ""
519
- data = resp.json()
520
- if len(data) > 1 and data[1]:
521
- title = data[1][0]
522
- summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
523
- s_resp = requests.get(summary_url)
524
- if s_resp.status_code == 200:
525
- s_data = s_resp.json()
526
- if "extract" in s_data:
527
- print("[LOG] Wikipedia summary fetched successfully.")
528
- return s_data["extract"]
529
- return ""
530
- except Exception as e:
531
- print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
532
- return ""
533
-
534
- def fetch_rss_feed(feed_url: str) -> list:
535
- print("[LOG] Fetching RSS feed:", feed_url)
536
- try:
537
- resp = requests.get(feed_url)
538
- if resp.status_code != 200:
539
- print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
540
- return []
541
- soup = BeautifulSoup(resp.content, "xml")
542
- items = soup.find_all("item")
543
- return items
544
- except Exception as e:
545
- print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
546
- return []
547
-
548
- def find_relevant_article(items, topic: str, min_match=2) -> tuple:
549
- print("[LOG] Finding relevant articles...")
550
- keywords = re.findall(r'\w+', topic.lower())
551
- for item in items:
552
- title = item.find("title").get_text().strip() if item.find("title") else ""
553
- description = item.find("description").get_text().strip() if item.find("description") else ""
554
- text = (title + " " + description).lower()
555
- matches = sum(1 for kw in keywords if kw in text)
556
- if matches >= min_match:
557
- link = item.find("link").get_text().strip() if item.find("link") else ""
558
- print(f"[LOG] Relevant article found: {title}")
559
- return title, description, link
560
- return None, None, None
561
-
562
- def fetch_article_text(link: str) -> str:
563
- print("[LOG] Fetching article text from:", link)
564
- if not link:
565
- print("[LOG] No link provided for article text.")
566
- return ""
567
- try:
568
- resp = requests.get(link)
569
- if resp.status_code != 200:
570
- print(f"[ERROR] Failed to fetch article from {link}")
571
- return ""
572
- soup = BeautifulSoup(resp.text, 'html.parser')
573
- paragraphs = soup.find_all("p")
574
- text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
575
- print("[LOG] Article text fetched successfully.")
576
- return text.strip()
577
- except Exception as e:
578
- print(f"[ERROR] Error fetching article text: {e}")
579
- return ""
580
-
581
- ###############################################################################
582
- # Comprehensive Audio Generation Function
583
- ###############################################################################
584
- def generate_audio_mp3(text: str, speaker: str) -> str:
585
- """
586
- This function is correctly generating and returning the actual MP3 file path.
587
- It utilizes Deepgram for English (American) and Murf for other languages.
588
  """
589
  try:
590
  import streamlit as st
@@ -709,54 +720,367 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
709
  print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
710
  return final_mp3_path
711
 
712
- except Exception as e:
713
- print("[ERROR] Error generating audio:", e)
714
- raise ValueError(f"Error generating audio: {str(e)}")
715
-
716
- def transcribe_youtube_video(video_url: str) -> str:
717
- print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
718
- video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
719
- if not video_id_match:
720
- raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
721
-
722
- video_id = video_id_match.group(1)
723
- print("[LOG] Extracted video ID:", video_id)
724
-
725
- base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
726
- params = {"video_id": video_id, "lang": "en"}
727
- headers = {
728
- "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
729
- "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
730
  }
731
 
732
  try:
733
- response = requests.get(base_url, headers=headers, params=params, timeout=30)
734
- print("[LOG] RapidAPI Response Status Code:", response.status_code)
735
- print("[LOG] RapidAPI Response Body:", response.text)
736
 
737
- if response.status_code != 200:
738
- raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
739
 
740
- data = response.json()
741
- if not isinstance(data, list) or not data:
742
- raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
743
 
744
- transcript_as_text = data[0].get('transcriptionAsText', '').strip()
745
- if not transcript_as_text:
746
- raise ValueError("transcriptionAsText field is missing or empty.")
747
 
748
- print("[LOG] Transcript retrieval successful.")
749
- print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
750
- snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
751
- print(f"[DEBUG] Transcript Snippet: {snippet}")
 
 
 
 
752
 
753
- return transcript_as_text
754
  except Exception as e:
755
- print("[ERROR] RapidAPI transcription error:", e)
756
- raise ValueError(f"Error transcribing YouTube video via RapidAPI: {str(e)}")
757
 
758
  ###############################################################################
759
- # generate_script Function and Helper
760
  ###############################################################################
761
 
762
  def generate_script(
@@ -826,7 +1150,7 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
826
  List[DialogueItem]: A list of DialogueItem objects.
827
  """
828
  # Define a regex pattern to identify lines like "HostName: Dialogue"
829
- pattern = r"(?i)\b({host}|{guest})\b:\s*(.*)".format(host=re.escape(host_name), guest=re.escape(guest_name))
830
  matches = re.findall(pattern, script_text)
831
 
832
  dialogue_items = []
@@ -844,53 +1168,6 @@ def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str)
844
  # Additional Helper Functions (if any)
845
  ###############################################################################
846
 
847
- def _preprocess_text_for_tts(text: str, speaker: str) -> str:
848
- # Unchanged logic for adding filler words, etc.
849
- text = re.sub(r"\bNo\.\b", "Number", text)
850
- text = re.sub(r"\b(?i)SaaS\b", "sass", text)
851
- abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
852
-
853
- def insert_periods_for_abbrev(m):
854
- abbr = m.group(0)
855
- if abbr in abbreviations_as_words:
856
- return abbr
857
- return ".".join(list(abbr)) + "."
858
-
859
- text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
860
- text = re.sub(r"\.\.", ".", text)
861
-
862
- def remove_periods_for_tts(m):
863
- return m.group().replace(".", " ").strip()
864
-
865
- text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
866
- text = re.sub(r"-", " ", text)
867
- text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
868
- text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
869
- text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
870
-
871
- if speaker != "Jane":
872
- def insert_thinking_pause(m):
873
- word = m.group(1)
874
- if random.random() < 0.3:
875
- filler = random.choice(['hmm,', 'well,', 'let me see,'])
876
- return f"{word}..., {filler}"
877
- else:
878
- return f"{word}...,"
879
-
880
- keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
881
- text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
882
-
883
- conj_pattern = r"\b(and|but|so|because|however)\b"
884
- text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
885
-
886
- text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
887
-
888
- def capitalize_match(m):
889
- return m.group().upper()
890
-
891
- text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
892
- return text.strip()
893
-
894
  def _spell_digits(d: str) -> str:
895
  digit_map = {
896
  '0': 'zero', '1': 'one', '2': 'two', '3': 'three',
@@ -899,35 +1176,6 @@ def _spell_digits(d: str) -> str:
899
  }
900
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
901
 
902
- def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
903
- # unchanged
904
- if custom_music_path:
905
- music_path = custom_music_path
906
- else:
907
- music_path = "bg_music.mp3"
908
-
909
- if not os.path.exists(music_path):
910
- print(f"[ERROR] Background music file not found: {music_path}")
911
- return spoken # Return spoken audio without background music
912
-
913
- try:
914
- bg_music = AudioSegment.from_file(music_path, format="mp3")
915
- except Exception as e:
916
- print("[ERROR] Failed to load background music:", e)
917
- return spoken
918
-
919
- bg_music = bg_music - 18.0
920
- total_length_ms = len(spoken) + 2000
921
- looped_music = AudioSegment.empty()
922
- while len(looped_music) < total_length_ms:
923
- looped_music += bg_music
924
- looped_music = looped_music[:total_length_ms]
925
- final_mix = looped_music.overlay(spoken, position=2000)
926
- return final_mix
927
-
928
  ###############################################################################
929
- # Unified aggregator: google + bing + wiki + rss + event registry + fallback
930
  ###############################################################################
931
- # The perform_deep_research function is already defined above.
932
-
933
- # No need to redefine perform_deep_research again.
 
1
  import os
 
2
  import json
3
+ import re
4
  import requests
5
  import tempfile
6
+ from typing import List
 
7
  from pydantic import BaseModel
8
+ from bs4 import BeautifulSoup
9
  from pydub import AudioSegment, effects
 
 
10
  import tiktoken
 
 
 
11
 
12
+ # Define Pydantic Models
13
  class DialogueItem(BaseModel):
14
+ speaker: str
15
+ display_speaker: str
16
  text: str
17
 
18
  class Dialogue(BaseModel):
19
  dialogue: List[DialogueItem]
20
 
21
+ ###############################################################################
22
+ # Helper Functions
23
+ ###############################################################################
24
+
25
+ def extract_text_from_pdf(pdf_path: str) -> str:
26
+ print("[LOG] Extracting text from PDF:", pdf_path)
27
+ try:
28
+ reader = pypdf.PdfReader(pdf_path)
29
+ text = ""
30
+ for page_num, page in enumerate(reader.pages):
31
+ page_text = page.extract_text()
32
+ if page_text:
33
+ text += page_text + "\n"
34
+ print("[LOG] Text extraction from PDF successful.")
35
+ return text
36
+ except Exception as e:
37
+ print(f"[ERROR] Failed to extract text from PDF: {e}")
38
+ return ""
39
+
40
+ def extract_text_from_url(url: str) -> str:
41
  print("[LOG] Extracting text from URL:", url)
42
  try:
43
  headers = {
 
61
  print(f"[ERROR] Exception during text extraction from URL: {e}")
62
  return ""
63
 
64
+ def fetch_wikipedia_summary(topic: str) -> str:
65
+ print("[LOG] Fetching Wikipedia summary for:", topic)
66
+ try:
67
+ search_url = (
68
+ f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
69
+ "&limit=1&namespace=0&format=json"
70
+ )
71
+ resp = requests.get(search_url)
72
+ if resp.status_code != 200:
73
+ print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
74
+ return ""
75
+ data = resp.json()
76
+ if len(data) > 1 and data[1]:
77
+ title = data[1][0]
78
+ summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
79
+ s_resp = requests.get(summary_url)
80
+ if s_resp.status_code == 200:
81
+ s_data = s_resp.json()
82
+ if "extract" in s_data:
83
+ print("[LOG] Wikipedia summary fetched successfully.")
84
+ return s_data["extract"]
85
+ return ""
86
+ except Exception as e:
87
+ print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
88
  return ""
89
 
90
+ def fetch_rss_feed(feed_url: str) -> list:
91
+ print("[LOG] Fetching RSS feed:", feed_url)
92
+ try:
93
+ resp = requests.get(feed_url)
94
+ if resp.status_code != 200:
95
+ print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
96
+ return []
97
+ soup = BeautifulSoup(resp.content, "xml")
98
+ items = soup.find_all("item")
99
+ return items
100
+ except Exception as e:
101
+ print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
102
+ return []
103
 
104
+ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
105
+ print("[LOG] Finding relevant articles...")
106
+ keywords = re.findall(r'\w+', topic.lower())
107
+ for item in items:
108
+ title = item.find("title").get_text().strip() if item.find("title") else ""
109
+ description = item.find("description").get_text().strip() if item.find("description") else ""
110
+ text = (title + " " + description).lower()
111
+ matches = sum(1 for kw in keywords if kw in text)
112
+ if matches >= min_match:
113
+ link = item.find("link").get_text().strip() if item.find("link") else ""
114
+ print(f"[LOG] Relevant article found: {title}")
115
+ return title, description, link
116
+ return None, None, None
117
 
118
+ def fetch_article_text(link: str) -> str:
119
+ print("[LOG] Fetching article text from:", link)
120
+ if not link:
121
+ print("[LOG] No link provided for article text.")
122
+ return ""
123
  try:
124
+ resp = requests.get(link)
125
+ if resp.status_code != 200:
126
+ print(f"[ERROR] Failed to fetch article from {link}")
127
+ return ""
128
+ soup = BeautifulSoup(resp.text, 'html.parser')
129
+ paragraphs = soup.find_all("p")
130
+ text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
131
+ print("[LOG] Article text fetched successfully.")
132
+ return text.strip()
133
  except Exception as e:
134
+ print(f"[ERROR] Error fetching article text: {e}")
135
+ return ""
136
 
 
 
 
137
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
138
  """
139
  Query https://eventregistry.org/api/v1/article/getArticles
 
182
  print("[ERROR] Event Registry approach failed:", e)
183
  return []
184
 
 
 
 
185
  def fetch_bing_results(query: str, count: int = 10) -> list:
186
  serp_api_key = os.environ.get("SERP_API_KEY")
187
  if not serp_api_key:
 
211
  print("[ERROR] Bing SerpApi approach failed:", e)
212
  return []
213
 
214
+ ###############################################################################
215
+ # Summarization Function
216
+ ###############################################################################
217
+
218
+ def summarize_text(text: str, max_length: int = 200) -> str:
219
+ """
220
+ Summarizes the given text to the specified maximum word length.
221
+
222
+ Args:
223
+ text (str): The text to summarize.
224
+ max_length (int): The maximum number of words in the summary.
225
+
226
+ Returns:
227
+ str: The summarized text.
228
+ """
229
+ system_prompt = (
230
+ f"You are a professional summarizer. Please condense the following text "
231
+ f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
232
+ )
233
+ user_prompt = text
234
+
235
+ try:
236
+ summary = call_deepseek_api(
237
+ system_prompt=system_prompt,
238
+ user_prompt=user_prompt,
239
+ max_tokens=500, # Adjust as needed
240
+ temperature=0.5
241
+ )
242
+ return summary.strip()
243
+ except Exception as e:
244
+ print(f"[ERROR] Summarization failed: {e}")
245
+ # Fallback: return the original text truncated to max_length words
246
+ return " ".join(text.split()[:max_length]) + "..."
247
+
248
+ ###############################################################################
249
+ # Rewrites text in professional style
250
+ ###############################################################################
251
+
252
+ def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
253
+ if not raw_text.strip():
254
+ return ""
255
+
256
+ system_prompt = (
257
+ "You are a professional writing assistant. Your goal is to rewrite "
258
+ "the provided text so that it is:\n"
259
+ "1) Written in clear, fluent, professional English\n"
260
+ f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
261
+ "3) Organized in paragraphs or bullet points\n"
262
+ "4) Maintained or slightly enhanced in detail without significant summarization\n"
263
+ "5) No references to the rewriting process or disclaimers\n"
264
+ )
265
+
266
+ user_prompt = f"Please rewrite this text:\n\n{raw_text}"
267
+
268
+ try:
269
+ rewritten = call_deepseek_api(
270
+ system_prompt=system_prompt,
271
+ user_prompt=user_prompt,
272
+ max_tokens=1024,
273
+ temperature=0.7
274
+ )
275
+ # Optionally, summarize the rewritten text to further reduce token count
276
+ summary = summarize_text(rewritten, max_length=150)
277
+ return summary
278
+ except Exception as e:
279
+ print("[ERROR] rewriting text via Deepseek LLM failed:", e)
280
+ return raw_text
281
+
282
+ ###############################################################################
283
+ # Event Registry (News API) aggregator
284
+ ###############################################################################
285
+ # Already handled in fetch_eventregistry_articles
286
+
287
+ ###############################################################################
288
+ # Bing via SerpApi
289
+ ###############################################################################
290
+ # Already handled in fetch_bing_results
291
+
292
  ###############################################################################
293
  # Unified aggregator: google + bing + wiki + rss + event registry + fallback
294
  ###############################################################################
295
+
296
  def perform_deep_research(topic: str) -> str:
297
  """
298
+ Perform deep research by aggregating data from multiple sources.
299
+ Limits the number of sources to prevent exceeding token limits.
300
+ Summarizes each source's content to reduce token count.
301
+
302
+ Args:
303
+ topic (str): The research topic.
304
+
305
+ Returns:
306
+ str: The final professional report in Markdown format.
307
  """
308
+ # Define the maximum number of sources per aggregator
309
+ MAX_SOURCES_PER_AGGREGATOR = 5
310
+
311
  # Step 1: Google
312
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
313
  google_api_key = os.environ.get("GOOGLE_API_KEY")
 
320
  "q": topic,
321
  "cx": google_cse_id,
322
  "key": google_api_key,
323
+ "num": 10 # Fetch more to account for filtering
324
  }
325
  resp = requests.get(url, params=params, timeout=15)
326
  resp.raise_for_status()
327
  data = resp.json()
328
+ items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
329
  for it in items:
330
  google_sources.append({
331
  "title": it.get("title", ""),
 
336
  print("[ERROR] Google approach failed:", e)
337
 
338
  # Step 2: Bing
339
+ bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
340
 
341
  # Step 3: Wikipedia summary
342
  wiki_summary_text = fetch_wikipedia_summary(topic)
 
348
  "snippet": wiki_summary_text
349
  }
350
 
351
+ # Step 4: RSS approach (NewsAPI assumed here)
352
+ rss_sources = []
353
  sources_dict = {
354
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
355
  "CNN": "http://rss.cnn.com/rss/edition.rss",
 
359
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
360
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
361
  }
362
+ for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
 
363
  try:
364
  items = fetch_rss_feed(feed_url)
365
  if not items:
 
384
  continue
385
 
386
  # Step 5: Event Registry
387
+ event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
388
 
389
+ # Combine all sources
390
  combined = []
391
  combined.extend(google_sources)
392
  combined.extend(bing_results)
 
408
  }]
409
  return _draft_professional_report(topic, fallback_data)
410
  else:
411
+ # Summarize each source's snippet to reduce token count
412
+ summarized_list = []
413
+ for idx, source in enumerate(combined, start=1):
414
+ summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
415
+ summarized_list.append({
416
+ "index": idx,
417
+ "title": source["title"],
418
+ "link": source["link"],
419
+ "cleaned_text": summary
420
+ })
421
+
422
+ return _draft_professional_report(topic, summarized_list)
423
 
424
+ ###############################################################################
425
+ # Professional Report Drafting Function
426
+ ###############################################################################
427
 
428
  def _draft_professional_report(topic: str, sources_list: list) -> str:
429
  """
430
+ Build a concise professional report:
431
  - Title
432
  - Executive Summary
433
  - Introduction
 
436
  - Conclusion
437
  - References footnotes
438
  Ensures at least ~1000 words.
439
+
440
+ Args:
441
+ topic (str): The research topic.
442
+ sources_list (list): List of summarized sources.
443
+
444
+ Returns:
445
+ str: The final professional report in Markdown format.
446
  """
447
  merged_text = []
448
  footnotes = []
 
456
  merged_text.append(text_block)
457
  all_content = "\n\n".join(merged_text)
458
 
459
+ # Build the system prompt
460
  system_prompt = f"""You are a highly skilled professional research analyst.
461
+ You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
 
462
 
463
+ **Report Structure:**
464
+ 1. **Title:** {topic}
465
+ 2. **Executive Summary:** A concise overview of key findings and insights.
466
  3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
467
  4. **Main Body:**
468
+ - **Section 1:** Insights from Source 1.
469
+ - **Section 2:** Insights from Source 2.
470
+ - *(Continue as needed)*
471
+ - **Analysis:** An in-depth analysis combining information from all sources.
472
+ 5. **Conclusion:** Final thoughts, implications, and potential future directions.
473
  6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
474
 
475
  **Requirements:**
476
+ - **Length:** At least 1,000 words.
477
  - **Content Quality:**
478
  - Incorporate relevant facts, figures, and statistics.
479
  - Use professional and clear language.
480
  - Ensure each section is well-developed without unnecessary repetition.
481
+ - **Structure:** Logical and cohesive flow throughout the report.
482
+ - **Formatting:** Proper formatting for headings, sub-headings, and references.
483
 
484
+ **Aggregated Content from Sources:**
485
  -----------------------------------------------------------------------
486
  {all_content}
487
  -----------------------------------------------------------------------
 
498
  # Calculate token counts
499
  max_tokens = 6000 # OpenRouter's token limit
500
  system_prompt_tokens = count_tokens(system_prompt)
 
 
501
 
502
+ print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
503
 
504
+ if system_prompt_tokens > max_tokens:
505
  # Calculate allowed tokens for all_content
506
  allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
507
  if allowed_tokens_for_content <= 0:
508
  print("[ERROR] System prompt alone exceeds the token limit.")
509
  return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
510
+
511
  # Truncate all_content to fit
512
  tokenizer = tiktoken.get_encoding("cl100k_base")
513
  all_content_tokens_list = tokenizer.encode(all_content)
 
520
  response = call_deepseek_api(
521
  system_prompt=system_prompt,
522
  user_prompt="", # No additional user prompt
523
+ max_tokens=3000, # Adjusted to allow more detailed output
524
  temperature=0.7
525
  )
526
  final_report = response.strip()
 
533
  print("[ERROR] Could not finalize professional report:", e)
534
  return "An unexpected error occurred. Please try again later."
535
 
536
+ ###############################################################################
537
+ # OpenRouter API Communication Function
538
+ ###############################################################################
539
+
540
  def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
541
  """
542
  Function to call DeepSeek R1 via OpenRouter API.
 
561
  }
562
  response = requests.post("https://openrouter.ai/api/v1/chat/completions",
563
  headers=headers, data=json.dumps(data))
564
+ if response.status_code != 200:
565
+ error_message = response.json().get("error", {}).get("message", "Unknown error")
566
+ print(f"[ERROR] OpenRouter API error: {response.status_code} - {error_message}")
567
+ raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
568
+
569
+ response_json = response.json()
570
+ if "choices" not in response_json or not response_json["choices"]:
571
+ print("[ERROR] 'choices' key missing in OpenRouter API response.")
572
+ raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
573
+
574
+ return response_json["choices"][0]["message"]["content"]
575
  except requests.exceptions.HTTPError as e:
576
  status_code = e.response.status_code
577
  error_content = e.response.json()
 
588
  print("[ERROR] Could not communicate with OpenRouter API:", e)
589
  raise ValueError("An unexpected error occurred. Please try again later.")
590
 
591
+ ###############################################################################
592
+ # Comprehensive Audio Generation Function
593
+ ###############################################################################
 
 
 
594
 
595
+ def generate_audio_mp3(text: str, speaker: str) -> str:
596
+ """
597
+ Generates and returns the actual MP3 file path.
598
+ Utilizes Deepgram for English (American) and Murf for other languages.
599
  """
600
  try:
601
  import streamlit as st
 
720
  print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
721
  return final_mp3_path
722
 
723
+ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
724
+ # Unchanged logic for adding filler words, etc.
725
+ text = re.sub(r"\bNo\.\b", "Number", text)
726
+ text = re.sub(r"\b(?i)SaaS\b", "sass", text)
727
+ abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
728
+
729
+ def insert_periods_for_abbrev(m):
730
+ abbr = m.group(0)
731
+ if abbr in abbreviations_as_words:
732
+ return abbr
733
+ return ".".join(list(abbr)) + "."
734
+
735
+ text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
736
+ text = re.sub(r"\.\.", ".", text)
737
+
738
+ def remove_periods_for_tts(m):
739
+ return m.group().replace(".", " ").strip()
740
+
741
+ text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
742
+ text = re.sub(r"-", " ", text)
743
+ text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
744
+ text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
745
+ text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
746
+
747
+ if speaker != "Jane":
748
+ def insert_thinking_pause(m):
749
+ word = m.group(1)
750
+ if random.random() < 0.3:
751
+ filler = random.choice(['hmm,', 'well,', 'let me see,'])
752
+ return f"{word}..., {filler}"
753
+ else:
754
+ return f"{word}...,"
755
+
756
+ keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
757
+ text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
758
+
759
+ conj_pattern = r"\b(and|but|so|because|however)\b"
760
+ text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
761
+
762
+ text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
763
+
764
+ def capitalize_match(m):
765
+ return m.group().upper()
766
+
767
+ text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
768
+ return text.strip()
769
+
770
+ ###############################################################################
771
+ # Unified aggregator: google + bing + wiki + rss + event registry + fallback
772
+ ###############################################################################
773
+
774
+ def perform_deep_research(topic: str) -> str:
775
+ """
776
+ Perform deep research by aggregating data from multiple sources.
777
+ Limits the number of sources to prevent exceeding token limits.
778
+ Summarizes each source's content to reduce token count.
779
+
780
+ Args:
781
+ topic (str): The research topic.
782
+
783
+ Returns:
784
+ str: The final professional report in Markdown format.
785
+ """
786
+ # Define the maximum number of sources per aggregator
787
+ MAX_SOURCES_PER_AGGREGATOR = 5
788
+
789
+ # Step 1: Google
790
+ google_cse_id = os.environ.get("GOOGLE_CSE_ID")
791
+ google_api_key = os.environ.get("GOOGLE_API_KEY")
792
+ google_sources = []
793
+ if google_cse_id and google_api_key:
794
+ try:
795
+ print("[LOG] Attempting Google CSE for topic:", topic)
796
+ url = "https://customsearch.googleapis.com/customsearch/v1"
797
+ params = {
798
+ "q": topic,
799
+ "cx": google_cse_id,
800
+ "key": google_api_key,
801
+ "num": 10 # Fetch more to account for filtering
802
+ }
803
+ resp = requests.get(url, params=params, timeout=15)
804
+ resp.raise_for_status()
805
+ data = resp.json()
806
+ items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
807
+ for it in items:
808
+ google_sources.append({
809
+ "title": it.get("title", ""),
810
+ "link": it.get("link", ""),
811
+ "snippet": it.get("snippet", "")
812
+ })
813
+ except Exception as e:
814
+ print("[ERROR] Google approach failed:", e)
815
+
816
+ # Step 2: Bing
817
+ bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
818
+
819
+ # Step 3: Wikipedia summary
820
+ wiki_summary_text = fetch_wikipedia_summary(topic)
821
+ wiki_item = None
822
+ if wiki_summary_text:
823
+ wiki_item = {
824
+ "title": "Wikipedia Summary",
825
+ "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
826
+ "snippet": wiki_summary_text
827
+ }
828
+
829
+ # Step 4: RSS approach (NewsAPI assumed here)
830
+ rss_sources = []
831
+ sources_dict = {
832
+ "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
833
+ "CNN": "http://rss.cnn.com/rss/edition.rss",
834
+ "Associated Press": "https://apnews.com/apf-topnews",
835
+ "NDTV": "https://www.ndtv.com/rss/top-stories",
836
+ "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
837
+ "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
838
+ "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
839
  }
840
+ for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
841
+ try:
842
+ items = fetch_rss_feed(feed_url)
843
+ if not items:
844
+ continue
845
+ title, desc, link = find_relevant_article(items, topic, min_match=2)
846
+ if link:
847
+ article_text = fetch_article_text(link)
848
+ if article_text:
849
+ rss_sources.append({
850
+ "title": f"{name} RSS Article",
851
+ "link": link,
852
+ "snippet": article_text
853
+ })
854
+ else:
855
+ rss_sources.append({
856
+ "title": f"{name} RSS Article",
857
+ "link": link,
858
+ "snippet": f"{title} - {desc}"
859
+ })
860
+ except Exception as e:
861
+ print(f"[ERROR] Error fetching from {name} RSS feed:", e)
862
+ continue
863
+
864
+ # Step 5: Event Registry
865
+ event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
866
+
867
+ # Combine all sources
868
+ combined = []
869
+ combined.extend(google_sources)
870
+ combined.extend(bing_results)
871
+ if wiki_item:
872
+ combined.append(wiki_item)
873
+ combined.extend(rss_sources)
874
+ combined.extend(event_registry_res)
875
+
876
+ if not combined:
877
+ print("[LOG] No results found from aggregator. Using LLM fallback.")
878
+ # LLM-based fallback
879
+ fallback_text = query_llm_for_additional_info(topic, "")
880
+ cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
881
+ fallback_data = [{
882
+ "index": 1,
883
+ "title": "Fallback Info",
884
+ "link": "N/A",
885
+ "cleaned_text": cleaned_fb
886
+ }]
887
+ return _draft_professional_report(topic, fallback_data)
888
+ else:
889
+ # Summarize each source's snippet to reduce token count
890
+ summarized_list = []
891
+ for idx, source in enumerate(combined, start=1):
892
+ summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
893
+ summarized_list.append({
894
+ "index": idx,
895
+ "title": source["title"],
896
+ "link": source["link"],
897
+ "cleaned_text": summary
898
+ })
899
+
900
+ return _draft_professional_report(topic, summarized_list)
901
+
902
+ ###############################################################################
903
+ # Professional Report Drafting Function
904
+ ###############################################################################
905
+
906
+ def _draft_professional_report(topic: str, sources_list: list) -> str:
907
+ """
908
+ Build a concise professional report:
909
+ - Title
910
+ - Executive Summary
911
+ - Introduction
912
+ - Main Body with sub-headings
913
+ - Analysis
914
+ - Conclusion
915
+ - References footnotes
916
+ Ensures at least ~1000 words.
917
+
918
+ Args:
919
+ topic (str): The research topic.
920
+ sources_list (list): List of summarized sources.
921
+
922
+ Returns:
923
+ str: The final professional report in Markdown format.
924
+ """
925
+ merged_text = []
926
+ footnotes = []
927
+ for s in sources_list:
928
+ footnotes.append(f"[^{s['index']}]: {s['link']}")
929
+ text_block = (
930
+ f"Source {s['index']} Title: {s['title']}\n"
931
+ f"FootnoteRef: [^{s['index']}]\n"
932
+ f"Text:\n{s['cleaned_text']}\n"
933
+ )
934
+ merged_text.append(text_block)
935
+ all_content = "\n\n".join(merged_text)
936
+
937
+ # Build the system prompt
938
+ system_prompt = f"""You are a highly skilled professional research analyst.
939
+ You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
940
+
941
+ **Report Structure:**
942
+ 1. **Title:** {topic}
943
+ 2. **Executive Summary:** A concise overview of key findings and insights.
944
+ 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
945
+ 4. **Main Body:**
946
+ - **Section 1:** Insights from Source 1.
947
+ - **Section 2:** Insights from Source 2.
948
+ - *(Continue as needed)*
949
+ - **Analysis:** An in-depth analysis combining information from all sources.
950
+ 5. **Conclusion:** Final thoughts, implications, and potential future directions.
951
+ 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
952
+
953
+ **Requirements:**
954
+ - **Length:** At least 1,000 words.
955
+ - **Content Quality:**
956
+ - Incorporate relevant facts, figures, and statistics.
957
+ - Use professional and clear language.
958
+ - Ensure each section is well-developed without unnecessary repetition.
959
+ - **Structure:** Logical and cohesive flow throughout the report.
960
+ - **Formatting:** Proper formatting for headings, sub-headings, and references.
961
+
962
+ **Aggregated Content from Sources:**
963
+ -----------------------------------------------------------------------
964
+ {all_content}
965
+ -----------------------------------------------------------------------
966
+ **Footnotes:**
967
+ {chr(10).join(footnotes)}
968
+ """
969
+
970
+ # Token Counting Function
971
+ def count_tokens(text: str) -> int:
972
+ tokenizer = tiktoken.get_encoding("cl100k_base")
973
+ tokens = tokenizer.encode(text)
974
+ return len(tokens)
975
+
976
+ # Calculate token counts
977
+ max_tokens = 6000 # OpenRouter's token limit
978
+ system_prompt_tokens = count_tokens(system_prompt)
979
+
980
+ print(f"[DEBUG] Total tokens before optimization: {system_prompt_tokens}")
981
+
982
+ if system_prompt_tokens > max_tokens:
983
+ # Calculate allowed tokens for all_content
984
+ allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
985
+ if allowed_tokens_for_content <= 0:
986
+ print("[ERROR] System prompt alone exceeds the token limit.")
987
+ return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
988
+
989
+ # Truncate all_content to fit
990
+ tokenizer = tiktoken.get_encoding("cl100k_base")
991
+ all_content_tokens_list = tokenizer.encode(all_content)
992
+ truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
993
+ truncated_content = tokenizer.decode(truncated_tokens)
994
+ system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
995
+ print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
996
 
997
  try:
998
+ response = call_deepseek_api(
999
+ system_prompt=system_prompt,
1000
+ user_prompt="", # No additional user prompt
1001
+ max_tokens=3000, # Adjusted to allow more detailed output
1002
+ temperature=0.7
1003
+ )
1004
+ final_report = response.strip()
1005
+ # Optionally, check word count
1006
+ word_count = len(final_report.split())
1007
+ if word_count < 1000:
1008
+ print(f"[WARNING] Generated report is below desired length: {word_count} words.")
1009
+ return final_report
1010
+ except Exception as e:
1011
+ print("[ERROR] Could not finalize professional report:", e)
1012
+ return "An unexpected error occurred. Please try again later."
1013
 
1014
+ ###############################################################################
1015
+ # PDF Generation Function
1016
+ ###############################################################################
1017
+
1018
+ def generate_pdf_from_markdown(markdown_text: str) -> bytes:
1019
+ """
1020
+ Converts Markdown text to a PDF file.
1021
+
1022
+ Args:
1023
+ markdown_text (str): The Markdown content to convert.
1024
 
1025
+ Returns:
1026
+ bytes: The generated PDF file in bytes.
1027
+ """
1028
+ try:
1029
+ # Convert Markdown to HTML
1030
+ import markdown
1031
+ import pdfkit
1032
+ html = markdown.markdown(markdown_text)
1033
 
1034
+ # Generate PDF from HTML
1035
+ pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
 
1036
 
1037
+ return pdf_bytes
1038
+ except Exception as e:
1039
+ print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
1040
+ return b""
1041
+
1042
+ ###############################################################################
1043
+ # Audio Mixing Function
1044
+ ###############################################################################
1045
 
1046
+ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
1047
+ """
1048
+ Mixes spoken audio with background music.
1049
+
1050
+ Args:
1051
+ spoken (AudioSegment): The spoken audio segment.
1052
+ custom_music_path (str, optional): Path to custom background music. Defaults to None.
1053
+
1054
+ Returns:
1055
+ AudioSegment: The mixed audio segment.
1056
+ """
1057
+ # unchanged
1058
+ if custom_music_path:
1059
+ music_path = custom_music_path
1060
+ else:
1061
+ music_path = "bg_music.mp3"
1062
+
1063
+ if not os.path.exists(music_path):
1064
+ print(f"[ERROR] Background music file not found: {music_path}")
1065
+ return spoken # Return spoken audio without background music
1066
+
1067
+ try:
1068
+ bg_music = AudioSegment.from_file(music_path, format="mp3")
1069
  except Exception as e:
1070
+ print("[ERROR] Failed to load background music:", e)
1071
+ return spoken
1072
+
1073
+ bg_music = bg_music - 18.0
1074
+ total_length_ms = len(spoken) + 2000
1075
+ looped_music = AudioSegment.empty()
1076
+ while len(looped_music) < total_length_ms:
1077
+ looped_music += bg_music
1078
+ looped_music = looped_music[:total_length_ms]
1079
+ final_mix = looped_music.overlay(spoken, position=2000)
1080
+ return final_mix
1081
 
1082
  ###############################################################################
1083
+ # Generate Script Function and Helper
1084
  ###############################################################################
1085
 
1086
  def generate_script(
 
1150
  List[DialogueItem]: A list of DialogueItem objects.
1151
  """
1152
  # Define a regex pattern to identify lines like "HostName: Dialogue"
1153
+ pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
1154
  matches = re.findall(pattern, script_text)
1155
 
1156
  dialogue_items = []
 
1168
  # Additional Helper Functions (if any)
1169
  ###############################################################################
1170
1171
  def _spell_digits(d: str) -> str:
1172
  digit_map = {
1173
  '0': 'zero', '1': 'one', '2': 'two', '3': 'three',
 
1176
  }
1177
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
1178
1179
  ###############################################################################
1180
+ # End of utils.py
1181
  ###############################################################################