siddhartharyaai committed on
Commit 26794f8 · verified · 1 Parent(s): e6f486e

Update utils.py

Files changed (1)
  1. utils.py +330 -785
utils.py CHANGED
@@ -1,59 +1,56 @@
1
  import os
2
- import json
3
  import re
 
4
  import requests
5
  import tempfile
6
- import time
7
- import logging
8
- from typing import List
9
- from pydantic import BaseModel
10
  from bs4 import BeautifulSoup
 
 
11
  from pydub import AudioSegment, effects
 
 
12
  import tiktoken
13
- import pypdf
14
- import markdown
15
- import pdfkit
16
  import random
17
- import warnings
18
- from cryptography.utils import CryptographyDeprecationWarning
19
- from ratelimit import limits, sleep_and_retry
20
- import streamlit as st
21
-
22
- # Suppress Cryptography Deprecation Warnings
23
- warnings.filterwarnings("ignore", category=CryptographyDeprecationWarning)
24
-
25
- # Configure Logging
26
- logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
27
 
28
- # Define Pydantic Models
 
 
29
  class DialogueItem(BaseModel):
30
- speaker: str
31
- display_speaker: str
32
  text: str
33
 
34
  class Dialogue(BaseModel):
35
  dialogue: List[DialogueItem]
36
 
37
- ###############################################################################
38
- # Helper Functions
39
- ###############################################################################
40
-
41
- def extract_text_from_pdf(pdf_path: str) -> str:
42
- print("[LOG] Extracting text from PDF:", pdf_path)
43
- try:
44
- reader = pypdf.PdfReader(pdf_path)
45
- text = ""
46
- for page_num, page in enumerate(reader.pages):
47
- page_text = page.extract_text()
48
- if page_text:
49
- text += page_text + "\n"
50
- print("[LOG] Text extraction from PDF successful.")
51
- return text
52
- except Exception as e:
53
- print(f"[ERROR] Failed to extract text from PDF: {e}")
54
- return ""
55
-
56
- def extract_text_from_url(url: str) -> str:
 
 
57
  print("[LOG] Extracting text from URL:", url)
58
  try:
59
  headers = {
@@ -77,120 +74,77 @@ def extract_text_from_url(url: str) -> str:
77
  print(f"[ERROR] Exception during text extraction from URL: {e}")
78
  return ""
79
 
80
- def fetch_wikipedia_summary(topic: str) -> str:
81
- print("[LOG] Fetching Wikipedia summary for:", topic)
82
- try:
83
- search_url = (
84
- f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
85
- "&limit=1&namespace=0&format=json"
86
- )
87
- resp = requests.get(search_url)
88
- if resp.status_code != 200:
89
- print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
90
- return ""
91
- data = resp.json()
92
- if len(data) > 1 and data[1]:
93
- title = data[1][0]
94
- summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
95
- s_resp = requests.get(summary_url)
96
- if s_resp.status_code == 200:
97
- s_data = s_resp.json()
98
- if "extract" in s_data:
99
- print("[LOG] Wikipedia summary fetched successfully.")
100
- return s_data["extract"]
101
- return ""
102
- except Exception as e:
103
- print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
104
- return ""
105
 
106
- def fetch_rss_feed(feed_url: str) -> list:
107
- print("[LOG] Fetching RSS feed:", feed_url)
108
- try:
109
- resp = requests.get(feed_url)
110
- if resp.status_code != 200:
111
- print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
112
- return []
113
- soup = BeautifulSoup(resp.content, "xml")
114
- items = soup.find_all("item")
115
- return items
116
- except Exception as e:
117
- print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
118
- return []
119
 
120
- def find_relevant_article(items, topic: str, min_match=2) -> tuple:
121
- print("[LOG] Finding relevant articles...")
122
- keywords = re.findall(r'\w+', topic.lower())
123
- for item in items:
124
- title = item.find("title").get_text().strip() if item.find("title") else ""
125
- description = item.find("description").get_text().strip() if item.find("description") else ""
126
- text = (title + " " + description).lower()
127
- matches = sum(1 for kw in keywords if kw in text)
128
- if matches >= min_match:
129
- link = item.find("link").get_text().strip() if item.find("link") else ""
130
- print(f"[LOG] Relevant article found: {title}")
131
- return title, description, link
132
- return None, None, None
133
-
134
- def fetch_article_text(link: str) -> str:
135
- print("[LOG] Fetching article text from:", link)
136
- if not link:
137
- print("[LOG] No link provided for article text.")
138
  return ""
 
 
 
 
 
 
 
 
 
 
139
  try:
140
- resp = requests.get(link)
141
- if resp.status_code != 200:
142
- print(f"[ERROR] Failed to fetch article from {link}")
143
- return ""
144
- soup = BeautifulSoup(resp.text, 'html.parser')
145
- paragraphs = soup.find_all("p")
146
- text = " ".join(p.get_text() for p in paragraphs[:10]) # Fetch more paragraphs for depth
147
- print("[LOG] Article text fetched successfully.")
148
- return text.strip()
149
  except Exception as e:
150
- print(f"[ERROR] Error fetching article text: {e}")
151
- return ""
152
 
 
 
 
153
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
154
- """
155
- Query https://eventregistry.org/api/v1/article/getArticles
156
- with the env var NEWS_API_KEY, searching for 'topic'.
157
- Return list of {title, link, snippet}.
158
- """
159
  news_api_key = os.environ.get("NEWS_API_KEY")
160
  if not news_api_key:
161
  print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
162
  return []
163
-
164
  print("[LOG] Attempting Event Registry for topic:", topic)
165
  endpoint = "https://eventregistry.org/api/v1/article/getArticles"
166
- # Minimal example request body
167
  body = {
168
  "action": "getArticles",
169
  "keyword": topic,
170
  "articlesPage": 1,
171
- "articlesCount": count, # up to 100, we do count=10 for uniformity
172
  "articlesSortBy": "date",
173
  "articlesSortByAsc": False,
174
  "dataType": ["news", "pr"],
175
- "forceMaxDataTimeWindow": 31, # last month
176
  "resultType": "articles",
177
  "apiKey": news_api_key
178
  }
179
-
180
  try:
181
  resp = requests.post(endpoint, json=body, timeout=20)
182
  resp.raise_for_status()
183
  data = resp.json()
184
- # According to docs, articles can be found at data["articles"]["results"]
185
  art_data = data.get("articles", {})
186
  results_arr = art_data.get("results", [])
187
-
188
  ret = []
189
  for item in results_arr:
190
- # item might have "title", "url", "body" or "titleUri"
191
  title = item.get("title", "")
192
  url = item.get("url", "")
193
- # pick either "body" or "excerpt"
194
  snippet = item.get("body", "") or item.get("excerpt", "")
195
  ret.append({"title": title, "link": url, "snippet": snippet})
196
  return ret
@@ -198,6 +152,9 @@ def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
198
  print("[ERROR] Event Registry approach failed:", e)
199
  return []
200
 
 
 
 
201
  def fetch_bing_results(query: str, count: int = 10) -> list:
202
  serp_api_key = os.environ.get("SERP_API_KEY")
203
  if not serp_api_key:
@@ -227,635 +184,11 @@ def fetch_bing_results(query: str, count: int = 10) -> list:
227
  print("[ERROR] Bing SerpApi approach failed:", e)
228
  return []
229
 
230
- ###############################################################################
231
- # Summarization Function
232
- ###############################################################################
233
-
234
- def summarize_text(text: str, max_length: int = 200) -> str:
235
- """
236
- Summarizes the given text to the specified maximum word length.
237
-
238
- Args:
239
- text (str): The text to summarize.
240
- max_length (int): The maximum number of words in the summary.
241
-
242
- Returns:
243
- str: The summarized text.
244
- """
245
- system_prompt = (
246
- f"You are a professional summarizer. Please condense the following text "
247
- f"into a summary of no more than {max_length} words, retaining the main ideas and key details."
248
- )
249
- user_prompt = text
250
-
251
- try:
252
- summary = call_deepseek_api_cached(
253
- system_prompt=system_prompt,
254
- user_prompt=user_prompt,
255
- max_tokens=500, # Adjust as needed
256
- temperature=0.5
257
- )
258
- return summary.strip()
259
- except Exception as e:
260
- print(f"[ERROR] Summarization failed: {e}")
261
- # Fallback: return the original text truncated to max_length words
262
- return " ".join(text.split()[:max_length]) + "..."
263
-
264
- ###############################################################################
265
- # Rewrites text in professional style
266
- ###############################################################################
267
-
268
- def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
269
- if not raw_text.strip():
270
- return ""
271
-
272
- system_prompt = (
273
- "You are a professional writing assistant. Your goal is to rewrite "
274
- "the provided text so that it is:\n"
275
- "1) Written in clear, fluent, professional English\n"
276
- f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
277
- "3) Organized in paragraphs or bullet points\n"
278
- "4) Maintained or slightly enhanced in detail without significant summarization\n"
279
- "5) No references to the rewriting process or disclaimers\n"
280
- )
281
-
282
- user_prompt = f"Please rewrite this text:\n\n{raw_text}"
283
-
284
- try:
285
- rewritten = call_deepseek_api_cached(
286
- system_prompt=system_prompt,
287
- user_prompt=user_prompt,
288
- max_tokens=1024,
289
- temperature=0.7
290
- )
291
- # Optionally, summarize the rewritten text to further reduce token count
292
- summary = summarize_text(rewritten, max_length=150)
293
- return summary
294
- except Exception as e:
295
- print("[ERROR] Rewriting text via Deepseek LLM failed:", e)
296
- return raw_text
297
-
298
- ###############################################################################
299
- # OpenRouter API Communication Function with Exponential Backoff and Rate Limiting
300
- ###############################################################################
301
-
302
- ONE_MINUTE = 60
303
-
304
- @sleep_and_retry
305
- @limits(calls=5, period=ONE_MINUTE) # Adjust based on OpenRouter's rate limits
306
- def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float, max_retries: int = 5) -> str:
307
- """
308
- Function to call DeepSeek R1 via OpenRouter API with exponential backoff for rate limiting.
309
- """
310
- logging.info("Communicating with DeepSeek R1 via OpenRouter API.")
311
- headers = {
312
- "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
313
- "Content-Type": "application/json",
314
- # Optional headers for OpenRouter leaderboard
315
- # "HTTP-Referer": "<YOUR_SITE_URL>",
316
- # "X-Title": "<YOUR_SITE_NAME>",
317
- }
318
- data = {
319
- "model": "deepseek/deepseek-r1:free", # Ensure this model name is correct
320
- "messages": [
321
- {"role": "system", "content": system_prompt},
322
- {"role": "user", "content": user_prompt}
323
- ],
324
- "max_tokens": max_tokens,
325
- "temperature": temperature
326
- }
327
-
328
- attempt = 0
329
- backoff_time = 1 # Start with 1 second
330
-
331
- while attempt < max_retries:
332
- try:
333
- response = requests.post("https://openrouter.ai/api/v1/chat/completions",
334
- headers=headers, data=json.dumps(data))
335
-
336
- logging.debug(f"OpenRouter API Response Status: {response.status_code}")
337
- logging.debug(f"OpenRouter API Response Body: {response.text}")
338
-
339
- if response.status_code == 200:
340
- response_json = response.json()
341
- if "choices" in response_json and response_json["choices"]:
342
- return response_json["choices"][0]["message"]["content"]
343
- else:
344
- logging.error("'choices' key missing in OpenRouter API response.")
345
- raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
346
- elif response.status_code == 429:
347
- # Rate limit exceeded
348
- retry_after = response.headers.get("Retry-After")
349
- if retry_after:
350
- wait_time = int(retry_after)
351
- else:
352
- wait_time = backoff_time
353
- logging.warning(f"Rate limit exceeded. Attempt {attempt + 1} of {max_retries}. Retrying in {wait_time} seconds...")
354
- time.sleep(wait_time)
355
- backoff_time *= 2 # Exponential backoff
356
- attempt += 1
357
- else:
358
- # Handle other HTTP errors
359
- try:
360
- error_message = response.json().get("error", {}).get("message", "Unknown error")
361
- except json.JSONDecodeError:
362
- error_message = "Non-JSON response received."
363
- logging.error(f"OpenRouter API error: {response.status_code} - {error_message}")
364
- raise ValueError(f"OpenRouter API error: {response.status_code} - {error_message}")
365
-
366
- except requests.exceptions.RequestException as e:
367
- logging.error(f"Request exception: {e}. Attempt {attempt + 1} of {max_retries}. Retrying in {backoff_time} seconds...")
368
- time.sleep(backoff_time)
369
- backoff_time *= 2
370
- attempt += 1
371
-
372
- # After max retries
373
- logging.error("Max retries exceeded. Failed to get a valid response from OpenRouter API.")
374
- raise ValueError("Rate limit exceeded. Please try again later.")
375
-
376
- @st.cache_data(show_spinner=False)
377
- def call_deepseek_api_cached(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
378
- return call_deepseek_api(system_prompt, user_prompt, max_tokens, temperature)
379
-
380
- ###############################################################################
381
- # Professional Report Drafting Function
382
- ###############################################################################
383
-
384
- def _draft_professional_report(topic: str, sources_list: list) -> str:
385
- """
386
- Build a concise professional report:
387
- - Title
388
- - Executive Summary
389
- - Introduction
390
- - Main Body with sub-headings
391
- - Analysis
392
- - Conclusion
393
- - References footnotes
394
- Ensures at least ~1000 words.
395
-
396
- Args:
397
- topic (str): The research topic.
398
- sources_list (list): List of summarized sources.
399
-
400
- Returns:
401
- str: The final professional report in Markdown format.
402
- """
403
- merged_text = []
404
- footnotes = []
405
- for s in sources_list:
406
- footnotes.append(f"[^{s['index']}]: {s['link']}")
407
- text_block = (
408
- f"Source {s['index']} Title: {s['title']}\n"
409
- f"FootnoteRef: [^{s['index']}]\n"
410
- f"Text:\n{s['cleaned_text']}\n"
411
- )
412
- merged_text.append(text_block)
413
- all_content = "\n\n".join(merged_text)
414
-
415
- # Build the system prompt
416
- system_prompt = f"""You are a highly skilled professional research analyst.
417
- You are tasked with creating a comprehensive and detailed formal research report on the topic: {topic}.
418
-
419
- **Report Structure:**
420
- 1. **Title:** {topic}
421
- 2. **Executive Summary:** A concise overview of key findings and insights.
422
- 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
423
- 4. **Main Body:**
424
- - **Section 1:** Insights from Source 1.
425
- - **Section 2:** Insights from Source 2.
426
- - *(Continue as needed)*
427
- - **Analysis:** An in-depth analysis combining information from all sources.
428
- 5. **Conclusion:** Final thoughts, implications, and potential future directions.
429
- 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
430
-
431
- **Requirements:**
432
- - **Length:** At least 1,000 words.
433
- - **Content Quality:**
434
- - Incorporate relevant facts, figures, and statistics.
435
- - Use professional and clear language.
436
- - Ensure each section is well-developed without unnecessary repetition.
437
- - **Structure:** Logical and cohesive flow throughout the report.
438
- - **Formatting:** Proper formatting for headings, sub-headings, and references.
439
-
440
- **Aggregated Content from Sources:**
441
- -----------------------------------------------------------------------
442
- {all_content}
443
- -----------------------------------------------------------------------
444
- **Footnotes:**
445
- {chr(10).join(footnotes)}
446
- """
447
-
448
- # Token Counting Function
449
- def count_tokens(text: str) -> int:
450
- tokenizer = tiktoken.get_encoding("cl100k_base")
451
- tokens = tokenizer.encode(text)
452
- return len(tokens)
453
-
454
- # Calculate token counts
455
- max_tokens = 6000 # OpenRouter's token limit
456
- system_prompt_tokens = count_tokens(system_prompt)
457
-
458
- logging.debug(f"Total tokens before optimization: {system_prompt_tokens}")
459
-
460
- if system_prompt_tokens > max_tokens:
461
- # Calculate allowed tokens for all_content
462
- allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100 # Reserve 100 tokens buffer
463
- if allowed_tokens_for_content <= 0:
464
- logging.error("System prompt alone exceeds the token limit.")
465
- return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
466
-
467
- # Truncate all_content to fit
468
- tokenizer = tiktoken.get_encoding("cl100k_base")
469
- all_content_tokens_list = tokenizer.encode(all_content)
470
- truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
471
- truncated_content = tokenizer.decode(truncated_tokens)
472
- system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
473
- logging.debug(f"Truncated content to fit token limits: {len(truncated_tokens)} tokens")
474
-
475
- try:
476
- response = call_deepseek_api_cached(
477
- system_prompt=system_prompt,
478
- user_prompt="", # No additional user prompt
479
- max_tokens=3000, # Adjusted to allow more detailed output
480
- temperature=0.7
481
- )
482
- final_report = response.strip()
483
- # Optionally, check word count
484
- word_count = len(final_report.split())
485
- if word_count < 1000:
486
- logging.warning(f"Generated report is below desired length: {word_count} words.")
487
- return final_report
488
- except Exception as e:
489
- logging.error(f"Could not finalize professional report: {e}")
490
- return "An unexpected error occurred. Please try again later."
491
-
492
- ###############################################################################
493
- # PDF Generation Function
494
- ###############################################################################
495
-
496
- def generate_pdf_from_markdown(markdown_text: str) -> bytes:
497
- """
498
- Converts Markdown text to a PDF file.
499
-
500
- Args:
501
- markdown_text (str): The Markdown content to convert.
502
-
503
- Returns:
504
- bytes: The generated PDF file in bytes.
505
- """
506
- try:
507
- # Convert Markdown to HTML
508
- html = markdown.markdown(markdown_text)
509
-
510
- # Generate PDF from HTML
511
- pdf_bytes = pdfkit.from_string(html, False) # False to return as bytes
512
-
513
- return pdf_bytes
514
- except Exception as e:
515
- print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
516
- return b""
517
-
518
- ###############################################################################
519
- # Audio Mixing Function
520
- ###############################################################################
521
-
522
- def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
523
- """
524
- Mixes spoken audio with background music.
525
-
526
- Args:
527
- spoken (AudioSegment): The spoken audio segment.
528
- custom_music_path (str, optional): Path to custom background music. Defaults to None.
529
-
530
- Returns:
531
- AudioSegment: The mixed audio segment.
532
- """
533
- if custom_music_path:
534
- music_path = custom_music_path
535
- else:
536
- music_path = "bg_music.mp3"
537
-
538
- if not os.path.exists(music_path):
539
- print(f"[ERROR] Background music file not found: {music_path}")
540
- return spoken # Return spoken audio without background music
541
-
542
- try:
543
- bg_music = AudioSegment.from_file(music_path, format="mp3")
544
- except Exception as e:
545
- print("[ERROR] Failed to load background music:", e)
546
- return spoken
547
-
548
- bg_music = bg_music - 18.0
549
- total_length_ms = len(spoken) + 2000
550
- looped_music = AudioSegment.empty()
551
- while len(looped_music) < total_length_ms:
552
- looped_music += bg_music
553
- looped_music = looped_music[:total_length_ms]
554
- final_mix = looped_music.overlay(spoken, position=2000)
555
- return final_mix
556
-
557
- ###############################################################################
558
- # Generate Script Function and Helper
559
- ###############################################################################
560
-
561
- def generate_script(
562
- system_prompt: str,
563
- input_text: str,
564
- tone: str,
565
- target_length: str,
566
- host_name: str = "Jane",
567
- guest_name: str = "John",
568
- sponsor_style: str = "Separate Break",
569
- sponsor_provided: bool = False
570
- ) -> Dialogue:
571
- """
572
- Generates a podcast script using DeepSeek R1 via OpenRouter API.
573
-
574
- Args:
575
- system_prompt (str): System-level instructions for the LLM.
576
- input_text (str): The main content or topic for the podcast.
577
- tone (str): Desired tone of the podcast (e.g., Casual, Formal).
578
- target_length (str): Desired length of the podcast (e.g., "3 Mins").
579
- host_name (str, optional): Name of the host. Defaults to "Jane".
580
- guest_name (str, optional): Name of the guest. Defaults to "John".
581
- sponsor_style (str, optional): Style of sponsor integration. Defaults to "Separate Break".
582
- sponsor_provided (bool, optional): Whether sponsor content is provided. Defaults to False.
583
-
584
- Returns:
585
- Dialogue: A Dialogue object containing dialogue items.
586
- """
587
- # Build the user prompt with additional instructions
588
- user_prompt = (
589
- f"Topic: {input_text}\n"
590
- f"Tone: {tone}\n"
591
- f"Length: {target_length}\n"
592
- f"Host: {host_name or 'Jane'}\n"
593
- f"Guest: {guest_name or 'John'}\n"
594
- )
595
- if sponsor_provided:
596
- user_prompt += f"Sponsor Style: {sponsor_style}\n"
597
-
598
- # Call the DeepSeek API to generate the script
599
- try:
600
- response = call_deepseek_api_cached(
601
- system_prompt=system_prompt,
602
- user_prompt=user_prompt,
603
- max_tokens=1500,
604
- temperature=0.7
605
- )
606
- except Exception as e:
607
- print(f"[ERROR] Failed to generate script: {e}")
608
- raise
609
-
610
- # Parse the response into DialogueItems
611
- dialogue_items = parse_script_to_dialogue(response, host_name, guest_name)
612
-
613
- return Dialogue(dialogue=dialogue_items)
614
-
615
- def parse_script_to_dialogue(script_text: str, host_name: str, guest_name: str) -> List[DialogueItem]:
616
- """
617
- Parses the script text into a list of DialogueItem objects.
618
-
619
- Args:
620
- script_text (str): The raw script text generated by the LLM.
621
- host_name (str): Name of the host.
622
- guest_name (str): Name of the guest.
623
-
624
- Returns:
625
- List[DialogueItem]: A list of DialogueItem objects.
626
- """
627
- # Define a regex pattern to identify lines like "HostName: Dialogue"
628
- pattern = rf"(?i)\b({re.escape(host_name)}|{re.escape(guest_name)})\b:\s*(.*)"
629
- matches = re.findall(pattern, script_text)
630
-
631
- dialogue_items = []
632
- for speaker, text in matches:
633
- speaker_normalized = "Jane" if speaker.lower() == host_name.lower() else "John"
634
- item = DialogueItem(
635
- speaker=speaker_normalized,
636
- display_speaker=speaker,
637
- text=text.strip()
638
- )
639
- dialogue_items.append(item)
640
- return dialogue_items
641
-
642
- ###############################################################################
643
- # Generate Audio MP3 Function
644
- ###############################################################################
645
-
646
- def generate_audio_mp3(text: str, speaker: str) -> str:
647
- """
648
- Generates and returns the actual MP3 file path.
649
- Utilizes Deepgram for English (American) and Murf for other languages.
650
-
651
- Args:
652
- text (str): The text to convert to speech.
653
- speaker (str): The speaker identifier (e.g., "John", "Jane").
654
- try:
659
- import streamlit as st
660
- print(f"[LOG] Generating audio for speaker: {speaker}")
661
- language_selection = st.session_state.get("language_selection", "English (American)")
662
-
663
- if language_selection == "English (American)":
664
- print("[LOG] Using Deepgram TTS for English (American)")
665
- # Process text if speaker is not Jane
666
- if speaker in ["John", "Jane"]:
667
- processed_text = text
668
- else:
669
- processed_text = _preprocess_text_for_tts(text, speaker)
670
-
671
- deepgram_api_url = "https://api.deepgram.com/v1/speak"
672
- params = {"model": "aura-asteria-en"}
673
- if speaker == "John":
674
- params["model"] = "aura-zeus-en"
675
-
676
- headers = {
677
- "Accept": "audio/mpeg",
678
- "Content-Type": "application/json",
679
- "Authorization": f"Token {os.environ.get('DEEPSEEK_API_KEY')}"
680
- }
681
- body = {"text": processed_text}
682
- response = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)
683
- if response.status_code != 200:
684
- raise ValueError(f"Deepgram TTS error: {response.status_code}, {response.text}")
685
-
686
- content_type = response.headers.get('Content-Type', '')
687
- if 'audio/mpeg' not in content_type:
688
- raise ValueError("Unexpected Content-Type from Deepgram.")
689
-
690
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
691
- for chunk in response.iter_content(chunk_size=8192):
692
- if chunk:
693
- mp3_file.write(chunk)
694
- mp3_path = mp3_file.name
695
-
696
- if not os.path.exists(mp3_path):
697
- raise FileNotFoundError(f"Deepgram did not create the MP3 file: {mp3_path}")
698
-
699
- audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
700
- audio_seg = effects.normalize(audio_seg)
701
- final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
702
- audio_seg.export(final_mp3_path, format="mp3")
703
- if os.path.exists(mp3_path):
704
- os.remove(mp3_path)
705
-
706
- print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
707
- if not os.path.exists(final_mp3_path):
708
- raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
709
-
710
- return final_mp3_path
711
-
712
- else:
713
- print(f"[LOG] Using Murf API for language: {language_selection}")
714
- # Process text if language is Hinglish or Hindi
715
- if language_selection == "Hinglish":
716
- from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
717
- text = transliterate(text, DEVANAGARI, IAST)
718
-
719
- api_key = os.environ.get("MURF_API_KEY")
720
- headers = {
721
- "Content-Type": "application/json",
722
- "Accept": "application/json",
723
- "api-key": api_key
724
- }
725
- multi_native_locale = "hi-IN" if language_selection in ["Hinglish", "Hindi"] else "en-IN"
726
- if language_selection == "English (Indian)":
727
- voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
728
- elif language_selection in ["Hindi", "Hinglish"]:
729
- voice_id = "hi-IN-kabir" if speaker == "John" else "hi-IN-shweta"
730
- else:
731
- voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
732
-
733
- payload = {
734
- "audioDuration": 0,
735
- "channelType": "MONO",
736
- "encodeAsBase64": False,
737
- "format": "WAV",
738
- "modelVersion": "GEN2",
739
- "multiNativeLocale": multi_native_locale,
740
- "pitch": 0,
741
- "pronunciationDictionary": {},
742
- "rate": 0,
743
- "sampleRate": 48000,
744
- "style": "Conversational",
745
- "text": text,
746
- "variation": 1,
747
- "voiceId": voice_id
748
- }
749
- response = requests.post("https://api.murf.ai/v1/speech/generate", headers=headers, json=payload)
750
- if response.status_code != 200:
751
- raise ValueError(f"Murf API error: {response.status_code}, {response.text}")
752
-
753
- json_resp = response.json()
754
- audio_url = json_resp.get("audioFile")
755
- if not audio_url:
756
- raise ValueError("No audio file URL returned by Murf API")
757
-
758
- audio_response = requests.get(audio_url)
759
- if audio_response.status_code != 200:
760
- raise ValueError(f"Error fetching audio from {audio_url}")
761
-
762
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
763
- wav_file.write(audio_response.content)
764
- wav_path = wav_file.name
765
-
766
- if not os.path.exists(wav_path):
767
- raise FileNotFoundError(f"Murf did not create the WAV file: {wav_path}")
768
-
769
- audio_seg = AudioSegment.from_file(wav_path, format="wav")
770
- audio_seg = effects.normalize(audio_seg)
771
- final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
772
- audio_seg.export(final_mp3_path, format="mp3")
773
- os.remove(wav_path)
774
-
775
- if not os.path.exists(final_mp3_path):
776
- raise FileNotFoundError(f"Final MP3 file was not created: {final_mp3_path}")
777
-
778
- print(f"[DEBUG] Generated MP3 Path: {final_mp3_path}")
779
- return final_mp3_path
780
-
781
- def _preprocess_text_for_tts(text: str, speaker: str) -> str:
782
- """
783
- Preprocesses text for Text-to-Speech conversion by adding pauses, fillers,
784
- and handling specific cases to make the speech sound more natural.
785
-
786
- Args:
787
- text (str): The original text to preprocess.
788
- speaker (str): The speaker identifier (e.g., "John", "Jane").
789
-
790
- Returns:
791
- str: The preprocessed text.
792
- """
793
- # Unchanged logic for adding filler words, etc.
794
- text = re.sub(r"\bNo\.\b", "Number", text)
795
- text = re.sub(r"\b(?i)SaaS\b", "sass", text)
796
- abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
797
-
798
- def insert_periods_for_abbrev(m):
799
- abbr = m.group(0)
800
- if abbr in abbreviations_as_words:
801
- return abbr
802
- return ".".join(list(abbr)) + "."
803
-
804
- text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
805
- text = re.sub(r"\.\.", ".", text)
806
-
807
- def remove_periods_for_tts(m):
808
- return m.group().replace(".", " ").strip()
809
-
810
- text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
811
- text = re.sub(r"-", " ", text)
812
- text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
813
- text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
814
- text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
815
-
816
- if speaker != "Jane":
817
- def insert_thinking_pause(m):
818
- word = m.group(1)
819
- if random.random() < 0.3:
820
- filler = random.choice(['hmm,', 'well,', 'let me see,'])
821
- return f"{word}..., {filler}"
822
- else:
823
- return f"{word}...,"
824
-
825
- keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
826
- text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
827
-
828
- conj_pattern = r"\b(and|but|so|because|however)\b"
829
- text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
830
-
831
- text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
832
-
833
- def capitalize_match(m):
834
- return m.group().upper()
835
-
836
- text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
837
- return text.strip()
838
-
839
- ###############################################################################
840
- # Unified aggregator: google + bing + wiki + rss + event registry + fallback
841
- ###############################################################################
842
-
843
  def perform_deep_research(topic: str) -> str:
844
- """
845
- Perform deep research by aggregating data from multiple sources.
846
- Limits the number of sources to prevent exceeding token limits.
847
- Summarizes each source's content to reduce token count.
848
-
849
- Args:
850
- topic (str): The research topic.
851
-
852
- Returns:
853
- str: The final professional report in Markdown format.
854
- """
855
- # Define the maximum number of sources per aggregator
856
- MAX_SOURCES_PER_AGGREGATOR = 5
857
-
858
- # Step 1: Google
859
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
860
  google_api_key = os.environ.get("GOOGLE_API_KEY")
861
  google_sources = []
@@ -867,12 +200,12 @@ def perform_deep_research(topic: str) -> str:
867
  "q": topic,
868
  "cx": google_cse_id,
869
  "key": google_api_key,
870
- "num": 10 # Fetch more to account for filtering
871
  }
872
  resp = requests.get(url, params=params, timeout=15)
873
  resp.raise_for_status()
874
  data = resp.json()
875
- items = data.get("items", [])[:MAX_SOURCES_PER_AGGREGATOR]
876
  for it in items:
877
  google_sources.append({
878
  "title": it.get("title", ""),
@@ -881,11 +214,7 @@ def perform_deep_research(topic: str) -> str:
881
  })
882
  except Exception as e:
883
  print("[ERROR] Google approach failed:", e)
884
-
885
- # Step 2: Bing
886
- bing_results = fetch_bing_results(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
887
-
888
- # Step 3: Wikipedia summary
889
  wiki_summary_text = fetch_wikipedia_summary(topic)
890
  wiki_item = None
891
  if wiki_summary_text:
@@ -894,9 +223,6 @@ def perform_deep_research(topic: str) -> str:
894
  "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
895
  "snippet": wiki_summary_text
896
  }
897
-
898
- # Step 4: RSS approach (NewsAPI assumed here)
899
- rss_sources = []
900
  sources_dict = {
901
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
902
  "CNN": "http://rss.cnn.com/rss/edition.rss",
@@ -906,7 +232,8 @@ def perform_deep_research(topic: str) -> str:
906
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
907
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
908
  }
909
- for name, feed_url in list(sources_dict.items())[:MAX_SOURCES_PER_AGGREGATOR]:
 
910
  try:
911
  items = fetch_rss_feed(feed_url)
912
  if not items:
@@ -929,11 +256,7 @@ def perform_deep_research(topic: str) -> str:
929
  except Exception as e:
930
  print(f"[ERROR] Error fetching from {name} RSS feed:", e)
931
  continue
932
-
933
- # Step 5: Event Registry
934
- event_registry_res = fetch_eventregistry_articles(topic, count=10)[:MAX_SOURCES_PER_AGGREGATOR]
935
-
936
- # Combine all sources
937
  combined = []
938
  combined.extend(google_sources)
939
  combined.extend(bing_results)
@@ -941,10 +264,8 @@ def perform_deep_research(topic: str) -> str:
941
  combined.append(wiki_item)
942
  combined.extend(rss_sources)
943
  combined.extend(event_registry_res)
944
-
945
  if not combined:
946
  print("[LOG] No results found from aggregator. Using LLM fallback.")
947
- # LLM-based fallback
948
  fallback_text = query_llm_for_additional_info(topic, "")
949
  cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
950
  fallback_data = [{
@@ -955,22 +276,231 @@ def perform_deep_research(topic: str) -> str:
955
  }]
956
  return _draft_professional_report(topic, fallback_data)
957
  else:
958
- # Summarize each source's snippet to reduce token count
959
- summarized_list = []
960
- for idx, source in enumerate(combined, start=1):
961
- summary = summarize_text(source["snippet"], max_length=200) # Summarize to 200 words
962
- summarized_list.append({
963
- "index": idx,
964
- "title": source["title"],
965
- "link": source["link"],
966
- "cleaned_text": summary
967
- })
 
 
 
968
 
969
- return _draft_professional_report(topic, summarized_list)
 
 
970
 
971
- ###############################################################################
972
- # Additional Helper Functions (if any)
973
- ###############################################################################
 
 
974
 
975
  def _spell_digits(d: str) -> str:
976
  digit_map = {
@@ -980,6 +510,21 @@ def _spell_digits(d: str) -> str:
980
  }
981
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
982
 
983
- ###############################################################################
984
- # End of utils.py
985
- ###############################################################################
 
 
1
  import os
 
2
  import re
3
+ import json
4
  import requests
5
  import tempfile
 
 
 
 
6
  from bs4 import BeautifulSoup
7
+ from typing import List, Literal
8
+ from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
+ from transformers import pipeline
11
+ import yt_dlp
12
  import tiktoken
13
+ import numpy as np
14
+ import torch
 
15
  import random
16
+ import base64
17
+ from io import BytesIO
18
+ import pdfkit
19
+ import markdown # For Markdown to HTML conversion
 
 
 
 
 
 
20
 
21
+ # ------------------------------
22
+ # Data models
23
+ # ------------------------------
24
  class DialogueItem(BaseModel):
25
+ speaker: Literal["Jane", "John"]
26
+ display_speaker: str = "Jane"
27
  text: str
28
 
29
  class Dialogue(BaseModel):
30
  dialogue: List[DialogueItem]
31
 
32
+ # ------------------------------
33
+ # ASR Pipeline setup
34
+ # ------------------------------
35
+ asr_pipeline = pipeline(
36
+ "automatic-speech-recognition",
37
+ model="openai/whisper-tiny.en",
38
+ device=0 if torch.cuda.is_available() else -1
39
+ )
40
+
41
+ # ------------------------------
42
+ # Helper functions
43
+ # ------------------------------
44
+ def truncate_text(text, max_tokens=2048):
45
+ print("[LOG] Truncating text if needed.")
46
+ tokenizer = tiktoken.get_encoding("cl100k_base")
47
+ tokens = tokenizer.encode(text)
48
+ if len(tokens) > max_tokens:
49
+ print("[LOG] Text too long, truncating.")
50
+ return tokenizer.decode(tokens[:max_tokens])
51
+ return text
52
+
53
+ def extract_text_from_url(url):
54
  print("[LOG] Extracting text from URL:", url)
55
  try:
56
  headers = {
 
74
  print(f"[ERROR] Exception during text extraction from URL: {e}")
75
  return ""
76
 
77
+ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
78
+ print(f"[LOG] Shifting pitch by {semitones} semitones.")
79
+ new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
80
+ shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
81
+ return shifted_audio.set_frame_rate(audio.frame_rate)
 
 
82
 
83
+ def is_sufficient(text: str, min_word_count: int = 500) -> bool:
84
+ word_count = len(text.split())
85
+ print(f"[DEBUG] Aggregated word count: {word_count}")
86
+ return word_count >= min_word_count
 
 
 
 
 
 
 
 
 
87
 
88
+ # ------------------------------
89
+ # Text rewriting using DeepSeek (via OpenRouter)
90
+ # ------------------------------
91
+ def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
92
+ if not raw_text.strip():
 
 
 
93
  return ""
94
+ system_prompt = (
95
+ "You are a professional writing assistant. Your goal is to rewrite "
96
+ "the provided text so that it is:\n"
97
+ "1) Written in clear, fluent, professional English\n"
98
+ f"2) On-topic about {topic}, removing any extraneous disclaimers or filler\n"
99
+ "3) Organized in paragraphs or bullet points\n"
100
+ "4) Maintained or slightly enhanced in detail without significant summarization\n"
101
+ "5) No references to the rewriting process or disclaimers\n"
102
+ )
103
+ user_prompt = f"Please rewrite this text:\n\n{raw_text}"
104
  try:
105
+ response = call_deepseek_api(
106
+ system_prompt=system_prompt,
107
+ user_prompt=user_prompt,
108
+ max_tokens=1024,
109
+ temperature=0.7
110
+ )
111
+ return response.strip()
 
 
112
  except Exception as e:
113
+ print("[ERROR] rewriting text via Deepseek LLM failed:", e)
114
+ return raw_text
115
 
116
+ # ------------------------------
117
+ # Event Registry aggregator
118
+ # ------------------------------
119
  def fetch_eventregistry_articles(topic: str, count: int = 10) -> list:
 
 
 
 
 
120
  news_api_key = os.environ.get("NEWS_API_KEY")
121
  if not news_api_key:
122
  print("[ERROR] Missing NEWS_API_KEY for Event Registry.")
123
  return []
 
124
  print("[LOG] Attempting Event Registry for topic:", topic)
125
  endpoint = "https://eventregistry.org/api/v1/article/getArticles"
 
126
  body = {
127
  "action": "getArticles",
128
  "keyword": topic,
129
  "articlesPage": 1,
130
+ "articlesCount": count,
131
  "articlesSortBy": "date",
132
  "articlesSortByAsc": False,
133
  "dataType": ["news", "pr"],
134
+ "forceMaxDataTimeWindow": 31,
135
  "resultType": "articles",
136
  "apiKey": news_api_key
137
  }
 
138
  try:
139
  resp = requests.post(endpoint, json=body, timeout=20)
140
  resp.raise_for_status()
141
  data = resp.json()
 
142
  art_data = data.get("articles", {})
143
  results_arr = art_data.get("results", [])
 
144
  ret = []
145
  for item in results_arr:
 
146
  title = item.get("title", "")
147
  url = item.get("url", "")
 
148
  snippet = item.get("body", "") or item.get("excerpt", "")
149
  ret.append({"title": title, "link": url, "snippet": snippet})
150
  return ret
 
152
  print("[ERROR] Event Registry approach failed:", e)
153
  return []
154
 
155
+ # ------------------------------
156
+ # Bing results via SerpApi
157
+ # ------------------------------
158
  def fetch_bing_results(query: str, count: int = 10) -> list:
159
  serp_api_key = os.environ.get("SERP_API_KEY")
160
  if not serp_api_key:
 
184
  print("[ERROR] Bing SerpApi approach failed:", e)
185
  return []
186
 
187
+ # ------------------------------
188
+ # Unified deep research aggregator
189
+ # ------------------------------
 
 
 
190
  def perform_deep_research(topic: str) -> str:
191
+ # Limit each source to a maximum of 5 items
 
 
192
  google_cse_id = os.environ.get("GOOGLE_CSE_ID")
193
  google_api_key = os.environ.get("GOOGLE_API_KEY")
194
  google_sources = []
 
200
  "q": topic,
201
  "cx": google_cse_id,
202
  "key": google_api_key,
203
+ "num": 5
204
  }
205
  resp = requests.get(url, params=params, timeout=15)
206
  resp.raise_for_status()
207
  data = resp.json()
208
+ items = data.get("items", [])
209
  for it in items:
210
  google_sources.append({
211
  "title": it.get("title", ""),
 
214
  })
215
  except Exception as e:
216
  print("[ERROR] Google approach failed:", e)
217
+ bing_results = fetch_bing_results(topic, count=5)
 
 
 
 
218
  wiki_summary_text = fetch_wikipedia_summary(topic)
219
  wiki_item = None
220
  if wiki_summary_text:
 
223
  "link": f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}",
224
  "snippet": wiki_summary_text
225
  }
 
 
 
226
  sources_dict = {
227
  "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
228
  "CNN": "http://rss.cnn.com/rss/edition.rss",
 
232
  "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
233
  "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
234
  }
235
+ rss_sources = []
236
+ for name, feed_url in sources_dict.items():
237
  try:
238
  items = fetch_rss_feed(feed_url)
239
  if not items:
 
256
  except Exception as e:
257
  print(f"[ERROR] Error fetching from {name} RSS feed:", e)
258
  continue
259
+ event_registry_res = fetch_eventregistry_articles(topic, count=5)
 
 
 
 
260
  combined = []
261
  combined.extend(google_sources)
262
  combined.extend(bing_results)
 
264
  combined.append(wiki_item)
265
  combined.extend(rss_sources)
266
  combined.extend(event_registry_res)
 
267
  if not combined:
268
  print("[LOG] No results found from aggregator. Using LLM fallback.")
 
269
  fallback_text = query_llm_for_additional_info(topic, "")
270
  cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
271
  fallback_data = [{
 
276
  }]
277
  return _draft_professional_report(topic, fallback_data)
278
  else:
279
+ final_list = []
280
+ idx = 0
281
+ for source in combined:
282
+ idx += 1
283
+ link = source.get("link", "")
284
+ snippet = source.get("snippet", "")
285
+ title = source.get("title", "")
286
+ cleaned_text = rewrite_in_professional_style(topic, snippet)
287
+ if cleaned_text.strip():
288
+ final_list.append({
289
+ "index": idx,
290
+ "title": title,
291
+ "link": link,
292
+ "cleaned_text": cleaned_text
293
+ })
294
+ if not final_list:
295
+ print("[LOG] Aggregator produced no final content after rewriting. Using LLM fallback.")
296
+ fallback_text = query_llm_for_additional_info(topic, "")
297
+ cleaned_fb = rewrite_in_professional_style(topic, fallback_text)
298
+ fallback_data = [{
299
+ "index": 1,
300
+ "title": "Fallback Info",
301
+ "link": "N/A",
302
+ "cleaned_text": cleaned_fb
303
+ }]
304
+ return _draft_professional_report(topic, fallback_data)
305
+ return _draft_professional_report(topic, final_list)
306
+
307
+ def _draft_professional_report(topic: str, sources_list: list) -> str:
308
+ merged_text = []
309
+ footnotes = []
310
+ for s in sources_list:
311
+ footnotes.append(f"[^{s['index']}]: {s['link']}")
312
+ text_block = (
313
+ f"Source {s['index']} Title: {s['title']}\n"
314
+ f"FootnoteRef: [^{s['index']}]\n"
315
+ f"Text:\n{s['cleaned_text']}\n"
316
+ )
317
+ merged_text.append(text_block)
318
+ all_content = "\n\n".join(merged_text)
319
+ system_prompt = f"""You are a highly skilled professional research analyst.
320
+ You have access to multiple authoritative sources on the topic: {topic}.
321
+ Your task is to produce a comprehensive and detailed formal research report that includes the following sections:
322
+
323
+ 1. **Title:** Use the topic as the title of the report.
324
+ 2. **Executive Summary:** Provide a concise overview highlighting the key findings and insights.
325
+ 3. **Introduction:** Introduce the topic, its relevance, and the scope of the report.
326
+ 4. **Main Body:**
327
+ - **Sub-heading 1:** Summarize insights from Source 1.
328
+ - **Sub-heading 2:** Summarize insights from Source 2.
329
+ - *(Continue as needed for all sources)*
330
+ - **Analysis:** Provide an in-depth analysis combining information from all sources.
331
+ 5. **Conclusion:** Present final thoughts, implications, and potential future directions.
332
+ 6. **References:** List all sources with numeric footnote markers [^1], [^2], etc.
333
+
334
+ **Requirements:**
335
+ - **Length:** The report must be at least **1,000 words** in total.
336
+ - **Content Quality:**
337
+ - Incorporate relevant facts, figures, and statistics.
338
+ - Use professional and clear language.
339
+ - Ensure each section is well-developed without unnecessary repetition.
340
+ - **Structure:** Maintain a logical and cohesive flow throughout the report.
341
+ - **Formatting:** Use proper formatting for headings, sub-headings, and references.
342
+
343
+ **Below is the aggregated content from your sources (with footnote references):**
344
+ -----------------------------------------------------------------------
345
+ {all_content}
346
+ -----------------------------------------------------------------------
347
+ **Footnotes:**
348
+ {chr(10).join(footnotes)}
349
+ """
350
+ def count_tokens(text: str) -> int:
351
+ tokenizer = tiktoken.get_encoding("cl100k_base")
352
+ tokens = tokenizer.encode(text)
353
+ return len(tokens)
354
+ max_tokens = 6000
355
+ system_prompt_tokens = count_tokens(system_prompt)
356
+ all_content_tokens = count_tokens(all_content)
357
+ total_tokens = system_prompt_tokens + all_content_tokens
358
+ print(f"[DEBUG] Total tokens before optimization: {total_tokens}")
359
+ if total_tokens > max_tokens:
360
+ allowed_tokens_for_content = max_tokens - system_prompt_tokens - 100
361
+ if allowed_tokens_for_content <= 0:
362
+ print("[ERROR] System prompt alone exceeds the token limit.")
363
+ return "The system prompt exceeds the token limit. Please reduce the complexity of your research."
364
+ tokenizer = tiktoken.get_encoding("cl100k_base")
365
+ all_content_tokens_list = tokenizer.encode(all_content)
366
+ truncated_tokens = all_content_tokens_list[:allowed_tokens_for_content]
367
+ truncated_content = tokenizer.decode(truncated_tokens)
368
+ system_prompt = system_prompt.replace(all_content, truncated_content + "\n\n[Content truncated to fit token limits.]")
369
+ print(f"[DEBUG] Truncated content to fit token limits: {len(truncated_tokens)} tokens")
370
+ try:
371
+ response = call_deepseek_api(
372
+ system_prompt=system_prompt,
373
+ user_prompt="",
374
+ max_tokens=3000,
375
+ temperature=0.7
376
+ )
377
+ final_report = response.strip()
378
+ word_count = len(final_report.split())
379
+ if word_count < 1000:
380
+ print(f"[WARNING] Generated report is below desired length: {word_count} words.")
381
+ return final_report
382
+ except Exception as e:
383
+ print("[ERROR] Could not finalize professional report:", e)
384
+ return "An unexpected error occurred. Please try again later."
385
+
386
+ def call_deepseek_api(system_prompt: str, user_prompt: str, max_tokens: int, temperature: float) -> str:
387
+ print("[LOG] Communicating with DeepSeek R1 via OpenRouter API.")
388
+ try:
389
+ headers = {
390
+ "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
391
+ "HTTP-Referer": "https://yourdomain.com", # Replace with your site URL if needed
392
+ "X-Title": "MyPod", # Replace with your site name if needed
393
+ "Content-Type": "application/json"
394
+ }
395
+ data = {
396
+ "model": "deepseek/deepseek-r1:free",
397
+ "messages": [
398
+ {"role": "system", "content": system_prompt},
399
+ {"role": "user", "content": user_prompt}
400
+ ],
401
+ "max_tokens": max_tokens,
402
+ "temperature": temperature
403
+ }
404
+ response = requests.post("https://openrouter.ai/api/v1/chat/completions",
405
+ headers=headers, data=json.dumps(data))
406
+ response.raise_for_status()
407
+ json_response = response.json()
408
+ if "choices" not in json_response:
409
+ raise ValueError("Invalid response from OpenRouter API: 'choices' key missing.")
410
+ return json_response["choices"][0]["message"]["content"]
411
+ except requests.exceptions.HTTPError as e:
412
+ status_code = e.response.status_code
413
+ if status_code == 503:
414
+ print("[ERROR] Service Unavailable from Deepseek API.")
415
+ raise ValueError("Service is currently unavailable. Please try again later.")
416
+ elif status_code == 413:
417
+ print("[ERROR] Request too large for Deepseek API.")
418
+ raise ValueError("The request is too large. Please reduce the input size and try again.")
419
+ else:
420
+ print("[ERROR] Deepseek API error:", e)
421
+ raise ValueError("An error occurred while generating the report. Please try again later.")
422
+ except Exception as e:
423
+ print("[ERROR] Could not communicate with Deepseek API:", e)
424
+ raise ValueError("An unexpected error occurred. Please try again later.")
425
+
426
+ def generate_pdf_from_markdown(markdown_text: str) -> bytes:
427
+ try:
428
+ html = markdown.markdown(markdown_text, extensions=["extra", "tables", "toc"])
429
+ pdf_bytes = pdfkit.from_string(html, False)
430
+ return pdf_bytes
431
+ except Exception as e:
432
+ print(f"[ERROR] Failed to generate PDF from Markdown: {e}")
433
+ return b""
434
+
435
+ def fetch_wikipedia_summary(topic: str) -> str:
436
+ print("[LOG] Fetching Wikipedia summary for:", topic)
437
+ try:
438
+ search_url = (
439
+ f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
440
+ "&limit=1&namespace=0&format=json"
441
+ )
442
+ resp = requests.get(search_url)
443
+ if resp.status_code != 200:
444
+ print(f"[ERROR] Failed to fetch Wikipedia search results for {topic}")
445
+ return ""
446
+ data = resp.json()
447
+ if len(data) > 1 and data[1]:
448
+ title = data[1][0]
449
+ summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
450
+ s_resp = requests.get(summary_url)
451
+ if s_resp.status_code == 200:
452
+ s_data = s_resp.json()
453
+ if "extract" in s_data:
454
+ print("[LOG] Wikipedia summary fetched successfully.")
455
+ return s_data["extract"]
456
+ return ""
457
+ except Exception as e:
458
+ print(f"[ERROR] Exception during Wikipedia summary fetch: {e}")
459
+ return ""
460
+
461
+ def fetch_rss_feed(feed_url: str) -> list:
462
+ print("[LOG] Fetching RSS feed:", feed_url)
463
+ try:
464
+ resp = requests.get(feed_url)
465
+ if resp.status_code != 200:
466
+ print(f"[ERROR] Failed to fetch RSS feed: {feed_url}")
467
+ return []
468
+ soup = BeautifulSoup(resp.content, "xml")
469
+ items = soup.find_all("item")
470
+ return items
471
+ except Exception as e:
472
+ print(f"[ERROR] Exception fetching RSS feed {feed_url}: {e}")
473
+ return []
474
 
475
+ def find_relevant_article(items, topic: str, min_match=2) -> tuple:
476
+ print("[LOG] Finding relevant articles...")
477
+ keywords = re.findall(r'\w+', topic.lower())
478
+ for item in items:
479
+ title = item.find("title").get_text().strip() if item.find("title") else ""
480
+ description = item.find("description").get_text().strip() if item.find("description") else ""
481
+ text = (title + " " + description).lower()
482
+ matches = sum(1 for kw in keywords if kw in text)
483
+ if matches >= min_match:
484
+ link = item.find("link").get_text().strip() if item.find("link") else ""
485
+ print(f"[LOG] Relevant article found: {title}")
486
+ return title, description, link
487
+ return None, None, None
488
 
489
+ # ------------------------------
490
+ # Preprocess text for TTS
491
+ # ------------------------------
492
+ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
493
+ text = re.sub(r"\bNo\.\b", "Number", text)
494
+ text = re.sub(r"(?i)\bSaaS\b", "sass", text)
495
+ abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
496
+ def insert_periods_for_abbrev(m):
497
+ abbr = m.group(0)
498
+ if abbr in abbreviations_as_words:
499
+ return abbr
500
+ return ".".join(list(abbr)) + "."
501
+ text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
502
+ text = re.sub(r"\.\.", ".", text)
503
+ return text
504
 
505
  def _spell_digits(d: str) -> str:
506
  digit_map = {
 
510
  }
511
  return " ".join(digit_map[ch] for ch in d if ch in digit_map)
512
 
513
+ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
514
+ if custom_music_path:
515
+ music_path = custom_music_path
516
+ else:
517
+ music_path = "bg_music.mp3"
518
+ try:
519
+ bg_music = AudioSegment.from_file(music_path, format="mp3")
520
+ except Exception as e:
521
+ print("[ERROR] Failed to load background music:", e)
522
+ return spoken
523
+ bg_music = bg_music - 18.0
524
+ total_length_ms = len(spoken) + 2000
525
+ looped_music = AudioSegment.empty()
526
+ while len(looped_music) < total_length_ms:
527
+ looped_music += bg_music
528
+ looped_music = looped_music[:total_length_ms]
529
+ final_mix = looped_music.overlay(spoken, position=2000)
530
+ return final_mix
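For quick verification of the refactored flow, a minimal usage sketch is shown below (not part of the commit). It assumes the new utils.py is importable, DEEPSEEK_API_KEY (OpenRouter) is set, and wkhtmltopdf is available for pdfkit; the topic string and output path are illustrative, and aggregator keys that are missing (GOOGLE_API_KEY/GOOGLE_CSE_ID, SERP_API_KEY, NEWS_API_KEY) generally just cause that source to log an error and contribute nothing.

```python
# Minimal sketch, not from the commit: exercise the refactored research path.
# Assumptions: utils.py from this commit is on the import path; DEEPSEEK_API_KEY
# is set for OpenRouter; wkhtmltopdf is installed so pdfkit can render the PDF.
from utils import perform_deep_research, generate_pdf_from_markdown

if __name__ == "__main__":
    topic = "renewable energy storage"                  # illustrative topic
    report_md = perform_deep_research(topic)            # aggregate sources -> Markdown report
    pdf_bytes = generate_pdf_from_markdown(report_md)   # Markdown -> PDF bytes via pdfkit

    if pdf_bytes:
        with open("report.pdf", "wb") as fh:            # illustrative output path
            fh.write(pdf_bytes)
        print("[LOG] Wrote report.pdf")
    else:
        # PDF rendering failed (e.g. wkhtmltopdf missing); fall back to the Markdown text.
        print("[ERROR] PDF generation failed; first part of the Markdown report:")
        print(report_md[:500])
```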