siddhartharyaai committed (verified)
Commit e19a7d9 · Parent(s): 14236f7

Create utils.py

Files changed (1): utils.py (+729, -0)
utils.py ADDED
@@ -0,0 +1,729 @@
import os
import re
import json
import requests
import tempfile
from bs4 import BeautifulSoup
from typing import List, Literal
from pydantic import BaseModel
from pydub import AudioSegment, effects
from transformers import pipeline
import yt_dlp
import tiktoken
from groq import Groq  # Retained for other functions if needed
import numpy as np
import torch
import random

import base64
from io import BytesIO
import altair as alt
import pdfkit
import altair_saver  # For PNG export with Altair

###############################################################################
# Pydantic Models
###############################################################################
class DialogueItem(BaseModel):
    speaker: Literal["Jane", "John"]
    display_speaker: str = "Jane"
    text: str

class Dialogue(BaseModel):
    dialogue: List[DialogueItem]

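# Illustrative sketch: the JSON that generate_script() below asks the LLM to emit
# maps directly onto these models (the values here are made up for the example):
#
#   Dialogue(dialogue=[
#       DialogueItem(speaker="Jane", display_speaker="Isha", text="Welcome to the show!"),
#       DialogueItem(speaker="John", display_speaker="Aarav", text="Great to be here."),
#   ])
#
# pydantic rejects any speaker value other than "Jane" or "John" because of the
# Literal type above; display_speaker carries the user-facing name.
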
###############################################################################
# ASR Pipeline (Whisper tiny)
###############################################################################
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=0 if torch.cuda.is_available() else -1
)

###############################################################################
# Helper: Truncate text if it exceeds token limit
###############################################################################
def truncate_text(text, max_tokens=2048):
    print("[LOG] Truncating text if needed.")
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        print("[LOG] Text too long, truncating.")
        return tokenizer.decode(tokens[:max_tokens])
    return text

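# Illustrative sketch of the helper above (token counts depend on the
# cl100k_base encoding; the input string is arbitrary):
#
#   short = truncate_text("word " * 5000, max_tokens=2048)
#   # len(tiktoken.get_encoding("cl100k_base").encode(short)) <= 2048
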
###############################################################################
# Extract text from a URL
###############################################################################
def extract_text_from_url(url):
    print("[LOG] Extracting text from URL:", url)
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/115.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ')
        print("[LOG] Text extraction from URL successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Exception during text extraction from URL: {e}")
        return ""

###############################################################################
# Optional pitch-shift (unused)
###############################################################################
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
    shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
    return shifted_audio.set_frame_rate(audio.frame_rate)

###############################################################################
# Check if text is sufficient
###############################################################################
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    word_count = len(text.split())
    print(f"[DEBUG] Aggregated word count: {word_count}")
    return word_count >= min_word_count

###############################################################################
# LLM fallback if insufficient data
###############################################################################
def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
    print("[LOG] Querying LLM for additional info.")
    system_prompt = (
        "You are an AI assistant with extensive knowledge up to 2023-10. "
        "Provide additional relevant information on the following topic based on your knowledge base.\n\n"
        f"Topic: {topic}\n\n"
        f"Existing Information: {existing_text}\n\n"
        "Please add more insightful details, facts, and perspectives."
    )
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
    except Exception as e:
        print("[ERROR] Groq API error during fallback:", e)
        return ""
    info = response.choices[0].message.content.strip()
    print("[DEBUG] Additional info from LLM:")
    print(info)
    return info

###############################################################################
# Rewrite text in professional style
###############################################################################
def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
    if not raw_text.strip():
        return ""

    system_prompt = (
        "You are a professional writing assistant. Rewrite the provided text:\n"
        "1) Use clear, fluent, professional English.\n"
        "2) Keep it on-topic about {topic}, removing disclaimers or non-English filler.\n"
        "3) Summarize if too long, but keep important data/facts.\n"
        "4) Organize in paragraphs/bullet points.\n"
        "5) Avoid referencing any rewriting.\n"
    ).format(topic=topic)

    user_prompt = f"Please rewrite this text:\n\n{raw_text}"
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting text via LLM:", e)
        return raw_text

###############################################################################
# Legacy research: RSS + Wikipedia
###############################################################################
def research_topic(topic: str) -> str:
    sources = {
        "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition.rss",
        "Associated Press": "https://apnews.com/apf-topnews",
        "NDTV": "https://www.ndtv.com/rss/top-stories",
        "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
        "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
        "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
        "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
    }

    summary_parts = []
    wiki_summary = fetch_wikipedia_summary(topic)
    if wiki_summary:
        summary_parts.append(f"From Wikipedia: {wiki_summary}")

    for name, feed_url in sources.items():
        try:
            items = fetch_rss_feed(feed_url)
            if not items:
                continue
            title, desc, link = find_relevant_article(items, topic, min_match=2)
            if link:
                article_text = fetch_article_text(link)
                if article_text:
                    summary_parts.append(f"From {name}: {article_text}")
                else:
                    summary_parts.append(f"From {name}: {title} - {desc}")
        except Exception as e:
            print(f"[ERROR] Error fetching from {name} RSS feed:", e)
            continue

    aggregated_info = " ".join(summary_parts)
    print("[DEBUG] Aggregated info from primary sources:")
    print(aggregated_info)

    if not is_sufficient(aggregated_info):
        print("[LOG] Not enough info. LLM fallback.")
        extra_info = query_llm_for_additional_info(topic, aggregated_info)
        if extra_info:
            aggregated_info += " " + extra_info
        else:
            print("[ERROR] LLM fallback gave nothing.")
    if not aggregated_info:
        return f"Sorry, no info on '{topic}'."
    return aggregated_info

def fetch_wikipedia_summary(topic: str) -> str:
    print("[LOG] Fetching Wikipedia summary for:", topic)
    try:
        search_url = (
            f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
            "&limit=1&namespace=0&format=json"
        )
        resp = requests.get(search_url)
        if resp.status_code != 200:
            print(f"[ERROR] Wikipedia fetch fail for {topic}")
            return ""
        data = resp.json()
        if len(data) > 1 and data[1]:
            title = data[1][0]
            summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
            s_resp = requests.get(summary_url)
            if s_resp.status_code == 200:
                s_data = s_resp.json()
                if "extract" in s_data:
                    print("[LOG] Wikipedia summary found.")
                    return s_data["extract"]
        return ""
    except Exception as e:
        print(f"[ERROR] Wikipedia summary error: {e}")
        return ""

def fetch_rss_feed(feed_url: str) -> list:
    print("[LOG] RSS feed:", feed_url)
    try:
        resp = requests.get(feed_url)
        if resp.status_code != 200:
            print(f"[ERROR] RSS feed fail: {feed_url}")
            return []
        soup = BeautifulSoup(resp.content, "xml")
        return soup.find_all("item")
    except Exception as e:
        print(f"[ERROR] RSS error: {e}")
        return []

def find_relevant_article(items, topic: str, min_match=2) -> tuple:
    print("[LOG] Searching relevant article...")
    keywords = re.findall(r'\w+', topic.lower())
    for item in items:
        title = item.find("title").get_text().strip() if item.find("title") else ""
        description = item.find("description").get_text().strip() if item.find("description") else ""
        text = (title + " " + description).lower()
        matches = sum(1 for kw in keywords if kw in text)
        if matches >= min_match:
            link = item.find("link").get_text().strip() if item.find("link") else ""
            print(f"[LOG] Relevant article found: {title}")
            return title, description, link
    return None, None, None

def fetch_article_text(link: str) -> str:
    print("[LOG] Fetching article text from:", link)
    if not link:
        print("[LOG] No link.")
        return ""
    try:
        r = requests.get(link)
        if r.status_code != 200:
            print(f"[ERROR] Article fetch fail: {link}")
            return ""
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs[:5])
        print("[LOG] Article text fetched.")
        return text.strip()
    except Exception as e:
        print(f"[ERROR] fetch_article_text error: {e}")
        return ""

###############################################################################
# Script generation for podcasts
###############################################################################
def generate_script(
    system_prompt: str,
    input_text: str,
    tone: str,
    target_length: str,
    host_name: str = "Jane",
    guest_name: str = "John",
    sponsor_style: str = "Separate Break",
    sponsor_provided=None
):
    import streamlit as st
    print("[LOG] Generating script. Tone:", tone, "Length:", target_length)

    if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        host_name = "Isha"
    if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        guest_name = "Aarav"

    words_per_minute = 150
    numeric_minutes = 3
    match = re.search(r"(\d+)", target_length)
    if match:
        numeric_minutes = int(match.group(1))

    min_words = max(50, numeric_minutes * 100)
    max_words = numeric_minutes * words_per_minute

    tone_map = {
        "Humorous": "funny and exciting",
        "Formal": "business-like, well-structured, professional",
        "Casual": "like a conversation between close friends",
        "Youthful": "energetic and lively"
    }
    chosen_tone = tone_map.get(tone, "casual")

    if sponsor_provided:
        if sponsor_style == "Separate Break":
            sponsor_instructions = "If sponsor content is provided, place in a separate ad break (~30s)."
        else:
            sponsor_instructions = "If sponsor content is provided, blend (~30s) into the conversation."
    else:
        sponsor_instructions = ""

    prompt = (
        f"{system_prompt}\n"
        f"TONE: {chosen_tone}\n"
        f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
        f"INPUT TEXT: {input_text}\n\n"
        f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
        "Output must be JSON:\n"
        "{\n"
        '  "dialogue": [\n'
        '    {"speaker": "Jane", "text": "..."},\n'
        '    {"speaker": "John", "text": "..."}\n'
        "  ]\n"
        "}"
    )
    print("[LOG] Prompt to LLM:", prompt)

    if st.session_state.get("language_selection") == "Hinglish":
        prompt += "\n\nPlease generate the script in Romanized Hindi.\n"
    elif st.session_state.get("language_selection") == "Hindi":
        prompt += "\n\nPlease generate the script exclusively in Hindi.\n"

    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, data=json.dumps(data))
        r.raise_for_status()
        raw_content = r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] LLM error:", e)
        raise ValueError(f"Error with LLM call: {e}")

    start_index = raw_content.find('{')
    end_index = raw_content.rfind('}')
    if start_index == -1 or end_index == -1:
        raise ValueError("No JSON found in LLM response.")

    json_str = raw_content[start_index:end_index+1].strip()
    try:
        data_js = json.loads(json_str)
        diag_list = data_js.get("dialogue", [])
        for d in diag_list:
            raw_speaker = d.get("speaker", "Jane")
            if raw_speaker.lower() == host_name.lower():
                d["speaker"] = "Jane"
                d["display_speaker"] = host_name
            elif raw_speaker.lower() == guest_name.lower():
                d["speaker"] = "John"
                d["display_speaker"] = guest_name
            else:
                d["speaker"] = "Jane"
                d["display_speaker"] = raw_speaker

        final_items = []
        for d in diag_list:
            if "display_speaker" not in d:
                d["display_speaker"] = d["speaker"]
            final_items.append(DialogueItem(**d))
        return Dialogue(dialogue=final_items)
    except Exception as e:
        print("[ERROR] JSON parse error:", e)
        raise ValueError(f"Failed to parse JSON from LLM: {e}")

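# Illustrative usage sketch (assumes DEEPSEEK_API_KEY is set and a Streamlit
# session is active; the argument values are made up for the example):
#
#   script = generate_script(
#       system_prompt="You are a podcast script writer.",
#       input_text=research_text,
#       tone="Casual",
#       target_length="3 Mins",
#   )
#   for item in script.dialogue:
#       print(f"{item.display_speaker}: {item.text}")
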
###############################################################################
# Transcribe YouTube (RapidAPI)
###############################################################################
def transcribe_youtube_video(video_url: str) -> str:
    print("[LOG] Transcribing YouTube via RapidAPI:", video_url)
    vid_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
    if not vid_match:
        raise ValueError("Invalid YouTube URL, cannot find video ID.")
    video_id = vid_match.group(1)

    base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
    params = {"video_id": video_id, "lang": "en"}
    headers = {
        "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
    }
    try:
        r = requests.get(base_url, headers=headers, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, list) or not data:
            raise ValueError("No transcript data returned.")
        text = data[0].get('transcriptionAsText', '').strip()
        if not text:
            raise ValueError("Transcript is empty.")
        return text
    except Exception as e:
        print("[ERROR] RapidAPI transcription error:", e)
        raise ValueError(f"Error transcribing YouTube: {e}")

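# Illustrative sketch (requires a valid RAPIDAPI_KEY; the URL is an arbitrary
# placeholder with an 11-character video ID so the regex above matches):
#
#   transcript = transcribe_youtube_video("https://www.youtube.com/watch?v=abcdefghijk")
#   transcript = truncate_text(transcript)
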
###############################################################################
# TTS => mp3 file path
###############################################################################
def generate_audio_mp3(text: str, speaker: str) -> str:
    import streamlit as st
    print(f"[LOG] Generating audio for speaker: {speaker}")
    language = st.session_state.get("language_selection", "English (American)")

    if language == "English (American)":
        # DEEPGRAM approach
        ...
    else:
        # MURF approach
        ...
    return "...some_mp3_file_path..."

def transcribe_youtube_video_OLD_YTDLP(video_url: str):
    pass

def _preprocess_text_for_tts(text: str, speaker: str) -> str:
    return text

def _spell_digits(d: str) -> str:
    return ""

###############################################################################
# Mix with BG music
###############################################################################
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
    if custom_music_path:
        music_path = custom_music_path
    else:
        music_path = "bg_music.mp3"
    try:
        bg_music = AudioSegment.from_file(music_path, format="mp3")
    except Exception as e:
        print("[ERROR] Could not load bg music:", e)
        return spoken
    bg_music = bg_music - 18.0
    total_len = len(spoken) + 2000
    looped = AudioSegment.empty()
    while len(looped) < total_len:
        looped += bg_music
    looped = looped[:total_len]
    final_mix = looped.overlay(spoken, position=2000)
    return final_mix

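# Illustrative sketch: overlaying generated speech on the default bg_music.mp3
# (the file names here are assumptions for the example, not files in this repo):
#
#   spoken = AudioSegment.from_file("segment_jane.mp3", format="mp3")
#   mixed = mix_with_bg_music(spoken)
#   mixed.export("final_podcast.mp3", format="mp3")
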
###############################################################################
# Q&A
###############################################################################
def call_groq_api_for_qa(system_prompt: str) -> str:
    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": system_prompt}],
            "max_tokens": 512,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions",
                          headers=headers, data=json.dumps(data))
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] QA call failed:", e)
        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering now."}
        return json.dumps(fallback)

###############################################################################
# Bing multi-search
###############################################################################
def fetch_bing_results(query: str, count: int = 12) -> list:
    """
    We also fix possible newlines in the key.
    """
    bing_api_key = os.environ.get("BING_API_KEY")
    if bing_api_key:
        bing_api_key = bing_api_key.strip()  # remove trailing newline if any
    else:
        return []

    print("[LOG] Attempting Bing Web Search for:", query)
    url = "https://api.bing.microsoft.com/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": bing_api_key}
    params = {"q": query, "count": count}
    try:
        resp = requests.get(url, headers=headers, params=params, timeout=15)
        if resp.status_code != 200:
            print("[ERROR] Bing search code:", resp.status_code)
            print("[DEBUG] Bing search body:", resp.text)
        resp.raise_for_status()
        data = resp.json()
        web_pages = data.get("webPages", {}).get("value", [])
        results = []
        for wp in web_pages:
            results.append({
                "title": wp.get("name", ""),
                "link": wp.get("url", ""),
                "snippet": wp.get("snippet", "")
            })
        return results
    except Exception as e:
        print("[ERROR] Bing search failed:", e)
        return []

###############################################################################
# Combine all cleaned sources
###############################################################################
def _gather_cleaned_sources(topic: str, sources_list: list) -> str:
    combined_body = []
    for s in sources_list:
        snippet = (
            f"**Title**: {s['title']}\n"
            f"**Link**: {s['link']}\n\n"
            f"{s['cleaned_text']}\n\n"
        )
        combined_body.append(snippet)
    return "\n".join(combined_body)

###############################################################################
# Reorganize final text into multi-section "Professional Research Report"
# with dynamic headings
###############################################################################
def rewrite_into_pro_outline(topic: str, combined_body: str) -> str:
    prompt = f"""
You are a professional research writer. Please produce a final research report with this structure:

# Professional Research Report
## {{Topic}}

### Executive Summary
(Write a concise summary of the key insights.)

Then automatically determine 3-5 relevant section headings for this content, giving each a descriptive title.
Follow with a 'Conclusion' section.
Finally add 'References & Footnotes', then '(End of Professional Report)'.

Adapt headings to the actual content. Avoid headings that don't make sense for the user topic.

Topic: {topic}

Raw Combined Text:
{combined_body}
"""
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        resp = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a professional research writer."},
                {"role": "user", "content": prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=4096,
            temperature=0.7
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting into pro outline failed:", e)
        return combined_body

###############################################################################
# The main function that queries Google & Bing
###############################################################################
def perform_deep_research(topic: str) -> str:
    # HF SPACES UPGRADE: remove trailing newlines from env secrets
    google_cse_id = os.environ.get("GOOGLE_CSE_ID")
    google_api_key = os.environ.get("GOOGLE_API_KEY")
    if google_cse_id:
        google_cse_id = google_cse_id.strip()
    if google_api_key:
        google_api_key = google_api_key.strip()

    all_sources = []

    # GOOGLE
    google_results = []
    if google_cse_id and google_api_key:
        try:
            print("[LOG] Attempting Google CSE for:", topic)
            url = "https://customsearch.googleapis.com/customsearch/v1"
            params = {
                "q": topic,
                "cx": google_cse_id,
                "key": google_api_key,
                "num": 12
            }
            # We'll do an explicit check/log
            resp = requests.get(url, params=params, timeout=15)
            if resp.status_code != 200:
                print("[ERROR] Google CSE status code:", resp.status_code)
                print("[DEBUG] Google CSE response body:", resp.text)
            resp.raise_for_status()
            data = resp.json()
            items = data.get("items", [])
            for it in items:
                google_results.append({
                    "title": it.get("title", ""),
                    "link": it.get("link", ""),
                    "snippet": it.get("snippet", "")
                })
        except requests.HTTPError as e:
            print("[ERROR] Google approach failed (HTTPError):", e)
        except Exception as e:
            print("[ERROR] Google approach failed (other error):", e)

    # BING
    bing_results = fetch_bing_results(topic, count=12)
    combined_raw = google_results + bing_results

    if not combined_raw:
        print("[LOG] No direct search results, fallback to older approach.")
        fallback_info = research_topic(topic)
        cleaned_fb = rewrite_in_professional_style(topic, fallback_info)
        all_sources = [{
            "index": 1,
            "title": "Fallback Info",
            "link": "N/A",
            "cleaned_text": cleaned_fb
        }]
    else:
        idx = 0
        for res in combined_raw:
            idx += 1
            link = res["link"]
            snippet = res["snippet"] or ""
            title = res["title"] or ""
            article_text = fetch_article_text(link)
            if not article_text.strip():
                article_text = snippet
            cleaned = rewrite_in_professional_style(topic, article_text)
            if cleaned.strip():
                item = {
                    "index": idx,
                    "title": title,
                    "link": link,
                    "cleaned_text": cleaned
                }
                all_sources.append(item)

        if not all_sources:
            print("[LOG] None found after rewriting, fallback anyway.")
            fb_info = research_topic(topic)
            cleaned_fb = rewrite_in_professional_style(topic, fb_info)
            all_sources = [{
                "index": 1,
                "title": "Fallback Info",
                "link": "N/A",
                "cleaned_text": cleaned_fb
            }]

    combined_body = _gather_cleaned_sources(topic, all_sources)
    final_report = rewrite_into_pro_outline(topic, combined_body)
    return final_report

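# Illustrative end-to-end sketch (assumes the Google/Bing/Groq keys above are
# configured; otherwise the RSS + Wikipedia fallback path is used; the topic
# string is made up):
#
#   report_md = perform_deep_research("state of small modular reactors")
#   pdf_bytes = generate_pdf_from_markdown(report_md)  # defined below
#   with open("report.pdf", "wb") as f:
#       f.write(pdf_bytes)
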
###############################################################################
# Chart Generation -> base64 PNG
###############################################################################
def generate_simple_chart(data_list: list) -> str:
    if not data_list:
        return ""
    import pandas as pd
    df = pd.DataFrame({"value": data_list, "index": range(len(data_list))})
    chart = alt.Chart(df).mark_bar().encode(
        x="index:O",
        y="value:Q"
    ).properties(title="Sample Chart")
    try:
        png_bytes = altair_saver.save(chart, fp=None, fmt="png")
        b64_img = base64.b64encode(png_bytes).decode("utf-8")
        return f"![Chart](data:image/png;base64,{b64_img})"
    except Exception as e:
        print("[ERROR] Chart export error:", e)
        return "*(Chart could not be generated in PNG form.)*"

###############################################################################
# Markdown -> PDF
###############################################################################
def generate_pdf_from_markdown(md_content: str) -> bytes:
    import markdown
    html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_html:
        tmp_html.write(html_content.encode("utf-8"))
        tmp_html_path = tmp_html.name

    tmp_pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    try:
        pdfkit.from_file(tmp_html_path, tmp_pdf_path)
        with open(tmp_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        if os.path.exists(tmp_html_path):
            os.remove(tmp_html_path)
        if os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)

    return pdf_bytes
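
# Minimal local smoke test, a sketch only: it assumes wkhtmltopdf is installed
# for pdfkit and that an altair_saver backend is available for PNG export.
if __name__ == "__main__":
    demo_chart = generate_simple_chart([3, 1, 4, 1, 5, 9])
    demo_pdf = generate_pdf_from_markdown("# Demo Report\n\nHello from utils.py\n\n" + demo_chart)
    with open("demo_report.pdf", "wb") as f:
        f.write(demo_pdf)
    print(f"[LOG] Wrote demo_report.pdf ({len(demo_pdf)} bytes)")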