siddhartharyaai committed
Commit 48c504d · verified · 1 Parent(s): 1530c27

Upload 4 files

Files changed (4)
  1. app.py +520 -0
  2. prompts.py +58 -0
  3. qa.py +88 -0
  4. utils.py +641 -0
app.py ADDED
@@ -0,0 +1,520 @@
import streamlit as st
import time
import re
import os
import tempfile
import pypdf
from pydub import AudioSegment, effects
import difflib

# CORRECTED IMPORT
from utils import (
    generate_script,
    generate_audio_mp3,
    mix_with_bg_music,
    DialogueItem,
    run_research_agent,
    generate_report
)
from prompts import SYSTEM_PROMPT
from qa import transcribe_audio_deepgram, handle_qa_exchange

MAX_QA_QUESTIONS = 5  # up to 5 voice/text questions

def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: str):
    pattern = r"\*\*(.+?)\*\*:\s*(.+)"
    matches = re.findall(pattern, edited_text)

    items = []
    if not matches:
        raw_name = host_name or "Jane"
        text_line = edited_text.strip()
        speaker = "Jane"
        if raw_name.lower() == guest_name.lower():
            speaker = "John"
        item = DialogueItem(
            speaker=speaker,
            display_speaker=raw_name,
            text=text_line
        )
        items.append(item)
        return items

    for (raw_name, text_line) in matches:
        if raw_name.lower() == host_name.lower():
            speaker = "Jane"
        elif raw_name.lower() == guest_name.lower():
            speaker = "John"
        else:
            speaker = "Jane"
        item = DialogueItem(
            speaker=speaker,
            display_speaker=raw_name,
            text=text_line
        )
        items.append(item)
    return items

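Not part of the committed file, but for reference: the parser above maps the `**Name**: line` transcript format back into DialogueItem objects, with unrecognized names falling back to the Jane voice slot. A quick illustrative check (assuming app.py is importable as a module):

    from app import parse_user_edited_transcript

    items = parse_user_edited_transcript(
        "**Jane**: Welcome back!\n**John**: Thanks, great to be here.",
        host_name="Jane",
        guest_name="John",
    )
    assert [i.speaker for i in items] == ["Jane", "John"]  # two dialogue items
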
def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
    audio_segments = []
    transcript = ""
    crossfade_duration = 50  # ms

    for item in dialogue_items:
        audio_file = generate_audio_mp3(item.text, item.speaker)
        seg = AudioSegment.from_file(audio_file, format="mp3")
        audio_segments.append(seg)
        transcript += f"**{item.display_speaker}**: {item.text}\n\n"
        os.remove(audio_file)

    if not audio_segments:
        return None, "No audio segments were generated."

    combined_spoken = audio_segments[0]
    for seg in audio_segments[1:]:
        combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)

    final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        final_mix.export(temp_audio.name, format="mp3")
        final_mp3_path = temp_audio.name

    with open(final_mp3_path, "rb") as f:
        audio_bytes = f.read()
    os.remove(final_mp3_path)

    return audio_bytes, transcript

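The stitching above leans on pydub's crossfaded append, which overlaps the tail of each segment with the head of the next. A self-contained sketch of the same pattern, using silent segments so it runs without any TTS call:

    from pydub import AudioSegment

    a = AudioSegment.silent(duration=1000)  # 1 s of silence
    b = AudioSegment.silent(duration=1000)
    combined = a.append(b, crossfade=50)    # 50 ms overlap at the join
    print(len(combined))                    # 1950 ms, not 2000
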
def generate_podcast(
    research_topic_input,
    tone,
    length_minutes,
    host_name,
    host_desc,
    guest_name,
    guest_desc,
    user_specs,
    sponsor_content,
    sponsor_style,
    custom_bg_music_path
):
    if not research_topic_input:
        return None, "Please enter a topic to research for the podcast."
    text = st.session_state.get("report_content", "")  # Get report content
    if not text:
        return None, "Please generate a research report first; the podcast script is built from the report content."

    extra_instructions = []
    if host_name or guest_name:
        host_line = f"Host: {host_name or 'Jane'} - {host_desc or 'a curious host'}."
        guest_line = f"Guest: {guest_name or 'John'} - {guest_desc or 'an expert'}."
        extra_instructions.append(f"{host_line}\n{guest_line}")

    if user_specs.strip():
        extra_instructions.append(f"Additional User Instructions: {user_specs}")

    if sponsor_content.strip():
        extra_instructions.append(
            f"Sponsor Content Provided (should be under ~30 seconds):\n{sponsor_content}"
        )

    combined_instructions = "\n\n".join(extra_instructions).strip()
    full_prompt = SYSTEM_PROMPT
    if combined_instructions:
        full_prompt += f"\n\n# Additional Instructions\n{combined_instructions}\n"

    # Add language-specific instructions
    if st.session_state.get("language_selection") == "Hinglish":
        full_prompt += "\n\nPlease generate the script in Romanized Hindi.\n"
    # Add a similar instruction here for Hindi

    try:
        script = generate_script(
            full_prompt,
            text,
            tone,
            f"{length_minutes} Mins",
            host_name=host_name or "Jane",
            guest_name=guest_name or "John",
            sponsor_style=sponsor_style,
            sponsor_provided=bool(sponsor_content.strip())
        )
        # If language is Hinglish, transliterate script dialogues to IAST
        if st.session_state.get("language_selection") == "Hinglish":
            from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
            for dialogue_item in script.dialogue:
                dialogue_item.text = transliterate(dialogue_item.text, DEVANAGARI, IAST)
    except Exception as e:
        return None, f"Error generating script: {str(e)}"

    audio_segments = []
    transcript = ""
    crossfade_duration = 50

    try:
        for item in script.dialogue:
            language = st.session_state.get("language_selection", "English (American)")
            if language in ["English (Indian)", "Hinglish", "Hindi"]:
                tts_speaker = "John" if item.display_speaker.lower() == (guest_name or "John").lower() else "Jane"
            else:
                tts_speaker = item.speaker

            audio_file = generate_audio_mp3(item.text, tts_speaker)
            seg = AudioSegment.from_file(audio_file, format="mp3")
            audio_segments.append(seg)
            transcript += f"**{item.display_speaker}**: {item.text}\n\n"
            os.remove(audio_file)

        if not audio_segments:
            return None, "No audio segments generated."

        combined_spoken = audio_segments[0]
        for seg in audio_segments[1:]:
            combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)

        final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)

        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            final_mix.export(temp_audio.name, format="mp3")
            final_mp3_path = temp_audio.name

        with open(final_mp3_path, "rb") as f:
            audio_bytes = f.read()
        os.remove(final_mp3_path)

        return audio_bytes, transcript
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"

def highlight_differences(original: str, edited: str) -> str:
    matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
    highlighted = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == 'equal':
            highlighted.extend(original.split()[i1:i2])
        elif opcode in ('replace', 'insert'):
            added_words = edited.split()[j1:j2]
            highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
        elif opcode == 'delete':
            pass
    return ' '.join(highlighted)

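highlight_differences walks difflib's opcodes over the two word lists and wraps only the edited side's new words in red spans. A small illustration of the opcodes it consumes:

    import difflib

    old = "the quick brown fox".split()
    new = "the slow brown fox".split()
    print(difflib.SequenceMatcher(None, old, new).get_opcodes())
    # [('equal', 0, 1, 0, 1), ('replace', 1, 2, 1, 2), ('equal', 2, 4, 2, 4)]
    # -> only "slow" would be wrapped in a red <span>.
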
def main():
    st.set_page_config(
        page_title="MyPod v3: AI-Powered Podcast & Research",
        layout="centered"
    )

    st.markdown("""
    <style>
    .stFileUploader>div>div>div {
        transform: scale(0.9);
    }
    footer {
        text-align: center;
        padding: 1em 0;
        font-size: 0.8em;
        color: #888;
    }
    </style>
    """, unsafe_allow_html=True)

    logo_col, title_col = st.columns([1, 10])
    with logo_col:
        st.image("logomypod.jpg", width=70)
    with title_col:
        st.markdown("## MyPod v3: AI-Powered Podcast & Research")

    st.markdown("""
    Welcome to **MyPod**, your go-to AI-powered podcast generator and research report tool! 🎉
    MyPod now offers two main functionalities:

    1. **Generate Research Reports:** Provide a research topic, and MyPod will use its AI-powered research agent to create a comprehensive, well-structured research report in PDF format.
    2. **Generate Podcasts:** Transform your research topic (or the generated report) into an engaging, human-sounding podcast.

    Select your desired mode below and let the magic happen!
    """)

    with st.expander("How to Use"):
        st.markdown("""
        **For Research Reports:**

        <ol style="font-size:18px;">
        <li>Select "Generate Research Report".</li>
        <li>Enter your research topic.</li>
        <li>Click 'Generate Report'.</li>
        <li>MyPod will use its AI agent to research the topic and create a PDF report.</li>
        <li>Once generated, you can view and download the report.</li>
        </ol>

        **For Podcasts:**

        <ol style="font-size:18px;">
        <li>Select "Generate Podcast".</li>
        <li>Enter the research topic (this will be used as the basis for the podcast), or first generate a report and then select Podcast.</li>
        <li>Choose the tone, language, and target duration.</li>
        <li>Add custom names and descriptions for the speakers (optional).</li>
        <li>Add sponsored content (optional).</li>
        <li>Click 'Generate Podcast'.</li>
        </ol>

        """, unsafe_allow_html=True)

    # --- Main Mode Selection ---
    mode = st.radio("Choose a Mode:", ["Generate Research Report", "Generate Podcast"])

    # --- Research Report Section ---
    if mode == "Generate Research Report":
        st.markdown("### Generate Research Report")
        research_topic_input = st.text_input("Enter your research topic:")
        report_button = st.button("Generate Report")

        if report_button:
            if not research_topic_input:
                st.error("Please enter a research topic.")
            else:
                with st.spinner("Researching and generating report... This may take several minutes."):
                    try:
                        report_content = run_research_agent(research_topic_input)
                        st.session_state["report_content"] = report_content

                        # Display report (basic text for now)
                        st.markdown("### Generated Report Preview")
                        st.text_area("Report Content", value=report_content, height=300)

                        # Generate PDF and offer download
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile:
                            pdf_path = tmpfile.name
                        generate_report(report_content, filename=pdf_path)  # Generate PDF

                        with open(pdf_path, "rb") as f:
                            pdf_bytes = f.read()
                        os.remove(pdf_path)  # Clean up temp file

                        st.download_button(
                            label="Download Report (PDF)",
                            data=pdf_bytes,
                            file_name=f"{research_topic_input}_report.pdf",
                            mime="application/pdf"
                        )
                        st.success("Report generated successfully!")

                    except Exception as e:
                        st.error(f"An error occurred: {e}")

    # --- Podcast Generation Section ---

    elif mode == "Generate Podcast":
        st.markdown("### Generate Podcast")

        research_topic_input = st.text_input("Enter research topic for the podcast (or use a generated report):")
        tone = st.radio("Tone", ["Casual", "Formal", "Humorous", "Youthful"], index=0)
        length_minutes = st.slider("Podcast Length (in minutes)", 1, 60, 3)

        language = st.selectbox(
            "Choose Language and Accent",
            ["English (American)", "English (Indian)", "Hinglish", "Hindi"],
            index=0
        )
        st.session_state["language_selection"] = language

        st.markdown("### Customize Your Podcast (Optional)")

        with st.expander("Set Host & Guest Names/Descriptions (Optional)"):
            host_name = st.text_input("Female Host Name (leave blank for 'Jane')")
            host_desc = st.text_input("Female Host Description (Optional)")
            guest_name = st.text_input("Male Guest Name (leave blank for 'John')")
            guest_desc = st.text_input("Male Guest Description (Optional)")

        user_specs = st.text_area("Any special instructions or prompts for the script? (Optional)", "")
        sponsor_content = st.text_area("Sponsored Content / Ad (Optional)", "")
        sponsor_style = st.selectbox("Sponsor Integration Style", ["Separate Break", "Blended"])

        custom_bg_music_file = st.file_uploader("Upload Custom Background Music (Optional)", type=["mp3", "wav"])
        custom_bg_music_path = None
        if custom_bg_music_file:
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(custom_bg_music_file.name)[1]) as tmp:
                tmp.write(custom_bg_music_file.read())
                custom_bg_music_path = tmp.name

        if "audio_bytes" not in st.session_state:
            st.session_state["audio_bytes"] = None
        if "transcript" not in st.session_state:
            st.session_state["transcript"] = None
        if "transcript_original" not in st.session_state:
            st.session_state["transcript_original"] = None
        if "qa_count" not in st.session_state:
            st.session_state["qa_count"] = 0
        if "conversation_history" not in st.session_state:
            st.session_state["conversation_history"] = ""

        generate_button = st.button("Generate Podcast")

        if generate_button:
            progress_bar = st.progress(0)
            progress_text = st.empty()

            progress_messages = [
                "🔍 Analyzing your input...",
                "📝 Crafting the perfect script...",
                "🎙️ Generating high-quality audio...",
                "🎶 Adding the finishing touches..."
            ]

            progress_text.write(progress_messages[0])
            progress_bar.progress(0)
            time.sleep(1.0)

            progress_text.write(progress_messages[1])
            progress_bar.progress(25)
            time.sleep(1.0)

            progress_text.write(progress_messages[2])
            progress_bar.progress(50)
            time.sleep(1.0)

            progress_text.write(progress_messages[3])
            progress_bar.progress(75)
            time.sleep(1.0)

            audio_bytes, transcript = generate_podcast(
                research_topic_input,
                tone,
                length_minutes,
                host_name,
                host_desc,
                guest_name,
                guest_desc,
                user_specs,
                sponsor_content,
                sponsor_style,
                custom_bg_music_path
            )

            progress_bar.progress(100)
            progress_text.write("✅ Done!")

            if audio_bytes is None:
                st.error(transcript)
                st.session_state["audio_bytes"] = None
                st.session_state["transcript"] = None
                st.session_state["transcript_original"] = None
            else:
                st.success("Podcast generated successfully!")
                st.session_state["audio_bytes"] = audio_bytes
                st.session_state["transcript"] = transcript
                st.session_state["transcript_original"] = transcript
                st.session_state["qa_count"] = 0
                st.session_state["conversation_history"] = ""

        if st.session_state.get("audio_bytes"):
            st.audio(st.session_state["audio_bytes"], format='audio/mp3')
            st.download_button(
                label="Download Podcast (MP3)",
                data=st.session_state["audio_bytes"],
                file_name="my_podcast.mp3",
                mime="audio/mpeg"
            )

            st.markdown("### Generated Transcript (Editable)")
            edited_text = st.text_area(
                "Feel free to tweak lines, fix errors, or reword anything.",
                value=st.session_state["transcript"],
                height=300
            )

            if st.session_state.get("transcript_original"):
                highlighted_transcript = highlight_differences(
                    st.session_state["transcript_original"],
                    edited_text
                )
                st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True)
                st.markdown(highlighted_transcript, unsafe_allow_html=True)

            if st.button("Regenerate Audio From Edited Text"):
                regen_bar = st.progress(0)
                regen_text = st.empty()

                regen_text.write("🔄 Regenerating your podcast with the edits...")
                regen_bar.progress(25)
                time.sleep(1.0)

                regen_text.write("🔧 Adjusting the script based on your changes...")
                regen_bar.progress(50)
                time.sleep(1.0)

                dialogue_items = parse_user_edited_transcript(
                    edited_text,
                    host_name or "Jane",
                    guest_name or "John"
                )
                new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path)

                regen_bar.progress(75)
                time.sleep(1.0)

                if new_audio_bytes is None:
                    regen_bar.progress(100)
                    st.error(new_transcript)
                else:
                    regen_bar.progress(100)
                    regen_text.write("✅ Regeneration complete!")
                    st.success("Regenerated audio below:")

                    st.session_state["audio_bytes"] = new_audio_bytes
                    st.session_state["transcript"] = new_transcript
                    st.session_state["transcript_original"] = new_transcript

                    st.audio(new_audio_bytes, format='audio/mp3')
                    st.download_button(
                        label="Download Edited Podcast (MP3)",
                        data=new_audio_bytes,
                        file_name="my_podcast_edited.mp3",
                        mime="audio/mpeg"
                    )
                    st.markdown("### Updated Transcript")
                    st.markdown(new_transcript)

            st.markdown("## Post-Podcast Q&A")
            used_questions = st.session_state.get("qa_count", 0)
            remaining = MAX_QA_QUESTIONS - used_questions

            if remaining > 0:
                st.write(f"You can ask up to {remaining} more question(s).")

                typed_q = st.text_input("Type your follow-up question:")
                audio_q = st.audio_input("Or record an audio question (WAV)")

                if st.button("Submit Q&A"):
                    if used_questions >= MAX_QA_QUESTIONS:
                        st.warning("You have reached the Q&A limit.")
                    else:
                        question_text = typed_q.strip()
                        if audio_q is not None:
                            suffix = ".wav"
                            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                                tmp.write(audio_q.read())
                                local_audio_path = tmp.name
                            st.write("Transcribing your audio question...")
                            audio_transcript = transcribe_audio_deepgram(local_audio_path)
                            if audio_transcript:
                                question_text = audio_transcript

                        if not question_text:
                            st.warning("No question found (text or audio).")
                        else:
                            st.write("Generating an answer...")
                            ans_audio, ans_text = handle_qa_exchange(question_text)
                            if ans_audio:
                                st.audio(ans_audio, format='audio/mp3')
                                st.markdown(f"**John**: {ans_text}")
                                st.session_state["qa_count"] = used_questions + 1
                            else:
                                st.warning("No response could be generated.")
            else:
                st.write(f"You have used all {MAX_QA_QUESTIONS} Q&A opportunities.")

    st.markdown("<footer>©2025 MyPod. All rights reserved.</footer>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()
prompts.py ADDED
@@ -0,0 +1,58 @@
# prompts.py

SYSTEM_PROMPT = (
    "You are a skilled podcast producer tasked with transforming unstructured or messy input text into an engaging "
    "and informative podcast script. Your goal is to extract the most interesting and insightful content for a "
    "compelling podcast discussion. Critically, you must incorporate both established background information (e.g., "
    "from an LLM knowledge base or Wikipedia) AND you must include any new or breaking news items found through RSS "
    "feeds or other sources.\n\n"

    "Steps to Follow:\n"
    "1. **Analyze the Input:** Carefully examine the text, identifying key topics, points, recent developments, and "
    "interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant or "
    "duplicate information.\n"
    "2. **Brainstorm Ideas:** Consider creative ways to present the key points in a lively, entertaining manner, "
    "incorporating the latest news or any recently discovered updates.\n"
    "3. **Craft the Dialogue:**\n"
    "   - **Warm Opening**: Have Jane (the host) welcome listeners, introduce the podcast name, and greet the guest. "
    "     Provide some quick background on John’s expertise or credentials.\n"
    "   - **Main Discussion**: Discuss the key points thoroughly, including new/breaking news items or any fresh "
    "     details from the topic’s latest developments. Jane asks thoughtful questions; John responds with "
    "     well-substantiated facts and relevant news. Be sure to highlight if there are significant changes, such "
    "     as a resignation or other major events.\n"
    "   - **Pleasant Conclusion**: End the episode in a friendly way, with Jane wrapping up and thanking the audience, "
    "     possibly directing them to future updates if the topic is ongoing.\n\n"

    "**Rules for the Dialogue:**\n"
    "- Jane always initiates the conversation and interviews John.\n"
    "- Include thoughtful questions from Jane to guide the discussion.\n"
    "- Incorporate natural speech patterns, including occasional verbal fillers (e.g., 'um,' 'well,' 'you know').\n"
    "- Allow for natural interruptions and back-and-forth between Jane and John.\n"
    "- If any new or updated info is found (e.g., a resignation), it must be mentioned and integrated into the flow.\n"
    "- Ensure John's responses are on-topic and substantiated by the input text and any newly discovered or breaking "
    "  news.\n"
    "- Maintain a PG-rated conversation appropriate for all audiences.\n"
    "- Avoid any marketing or self-promotional content from John.\n"
    "- Jane concludes the conversation in a pleasant manner, possibly teasing future updates if the topic is still "
    "  evolving.\n\n"

    "**Stylistic Guidelines for Natural Dialogue:**\n"
    "- The dialogue should sound natural and conversational between Jane and John.\n"
    "- Use a mix of short, punchy sentences along with longer, reflective sentences to create a dynamic rhythm.\n"
    "- Include natural pauses and breaks to mimic human speech, using ellipses (...) or sentence fragments where "
    "  appropriate.\n"
    "- Vary sentence structures to avoid monotony; mix questions, statements, and exclamations.\n"
    "- Inject humor or light-hearted comments to enhance relatability and keep the tone friendly.\n"
    "- Predominantly use active voice to create a direct and engaging conversation.\n"
    "- Add emotional inflections reflecting excitement, curiosity, or contemplation as needed.\n"
    "- Occasionally include filler words like 'um' or 'you know' to enhance authenticity, but avoid overuse.\n"
    "- Ensure Jane and John occasionally acknowledge each other with phrases like 'That's a great point!' or "
    "  'I totally agree!' to simulate a real conversation.\n\n"

    "The goal is to create an audio output that feels lively, relatable, and easy for listeners to follow.\n\n"

    "# Additional Instruction for Interjections / Interruptions\n"
    "Please include occasional, short interruptions or interjections where Jane or John might briefly cut in on "
    "the other’s sentence (without overlapping audio). For example, they might say, 'Wait, wait...' or 'Hold on...' "
    "to jump in, and then politely yield so the conversation remains understandable in sequence.\n"
)
qa.py ADDED
@@ -0,0 +1,88 @@
# qa.py

import os
import requests
import json
import tempfile
import streamlit as st

from utils import generate_audio_mp3  # Reuse your existing TTS function

def transcribe_audio_deepgram(local_audio_path: str) -> str:
    """
    Sends a local audio file to Deepgram for STT.
    Returns the transcript text if successful, or raises an error if failed.
    """
    DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
    if not DEEPGRAM_API_KEY:
        raise ValueError("Deepgram API key not found in environment variables.")

    url = "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true"
    headers = {
        "Authorization": f"Token {DEEPGRAM_API_KEY}",
        "Content-Type": "audio/wav"
    }

    with open(local_audio_path, "rb") as f:
        response = requests.post(url, headers=headers, data=f)
    response.raise_for_status()

    data = response.json()
    # Extract the transcript
    transcript = data["results"]["channels"][0]["alternatives"][0].get("transcript", "")
    return transcript

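The nested lookup above follows the shape of Deepgram's pre-recorded transcription response. Trimmed to just the fields this function touches, and with made-up values, it looks roughly like:

    sample = {  # illustrative only; real responses carry many more fields
        "results": {
            "channels": [
                {"alternatives": [{"transcript": "hello there", "confidence": 0.99}]}
            ]
        }
    }
    print(sample["results"]["channels"][0]["alternatives"][0]["transcript"])
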
def call_llm_for_qa(conversation_so_far: str, user_question: str) -> dict:
    """
    Calls the Groq LLM to answer a follow-up question.
    Returns a Python dict: {"speaker": "John", "text": "..."}
    """
    system_prompt = f"""
    You are John, the guest speaker. The user is asking a follow-up question.
    Conversation so far:
    {conversation_so_far}

    New user question:
    {user_question}

    Please respond in JSON with keys "speaker" and "text", e.g.:
    {{ "speaker": "John", "text": "Sure, here's my answer..." }}
    """

    from utils import call_groq_api_for_qa  # Import from utils

    raw_json_response = call_groq_api_for_qa(system_prompt)
    # Expect a JSON string: {"speaker": "John", "text": "some short answer"};
    # slice out the outermost JSON object in case the model wraps it in extra text.
    start_idx = raw_json_response.find("{")
    end_idx = raw_json_response.rfind("}")
    if start_idx == -1 or end_idx == -1:
        raise ValueError("No JSON found in QA response.")
    response_dict = json.loads(raw_json_response[start_idx:end_idx + 1])
    return response_dict


def handle_qa_exchange(user_question: str) -> tuple[bytes, str]:
    """
    1) Read conversation_so_far from session_state
    2) Call the LLM for a short follow-up answer
    3) Generate TTS audio
    4) Return (audio_bytes, answer_text)
    """
    conversation_so_far = st.session_state.get("conversation_history", "")

    # Ask the LLM
    response_dict = call_llm_for_qa(conversation_so_far, user_question)
    answer_text = response_dict.get("text", "")
    speaker = response_dict.get("speaker", "John")

    # Update conversation
    new_history = conversation_so_far + f"\nUser: {user_question}\n{speaker}: {answer_text}\n"
    st.session_state["conversation_history"] = new_history

    if not answer_text.strip():
        return (None, "")

    # TTS
    audio_file_path = generate_audio_mp3(answer_text, "John")  # always John
    with open(audio_file_path, "rb") as f:
        audio_bytes = f.read()
    os.remove(audio_file_path)

    return (audio_bytes, answer_text)
utils.py ADDED
@@ -0,0 +1,641 @@
import os
import re
import json
import requests
import tempfile
import random
import numpy as np
import torch
import time

from bs4 import BeautifulSoup
from typing import List, Literal, Optional
from pydantic import BaseModel
from pydub import AudioSegment, effects
from transformers import pipeline
import tiktoken
from groq import Groq

import streamlit as st  # If you use Streamlit for session state

from report_structure import generate_report  # Your PDF generator
from tavily import TavilyClient  # For search


###############################################################################
# DATA MODELS
###############################################################################

class DialogueItem(BaseModel):
    speaker: Literal["Jane", "John"]
    display_speaker: str = "Jane"
    text: str

class Dialogue(BaseModel):
    dialogue: List[DialogueItem]

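The two pydantic models give scripts a validated shape: speaker is constrained to the internal voice slots ("Jane"/"John"), while display_speaker carries the user-facing name. A quick illustration:

    item = DialogueItem(speaker="John", display_speaker="Aarav", text="Good question!")
    script = Dialogue(dialogue=[item])
    # Anything outside the two voice slots is rejected at construction time:
    # DialogueItem(speaker="Bob", text="hi")  -> pydantic ValidationError
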
###############################################################################
# HYBRID RATE-LIMIT HANDLER
###############################################################################

def call_llm_with_retry(groq_client, **payload):
    """
    Wraps groq_client.chat.completions.create(**payload) in a retry loop
    to catch 429 rate-limit errors. If we see “try again in XXs,” we parse
    that wait time, sleep, then retry. We also do a short sleep (0.3s)
    after each successful call to spread usage.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            print(f"[DEBUG] call_llm_with_retry attempt {attempt+1}")
            response = groq_client.chat.completions.create(**payload)
            # Short sleep to avoid bursting usage
            time.sleep(0.3)
            print("[DEBUG] LLM call succeeded, returning response.")
            return response
        except Exception as e:
            err_str = str(e).lower()
            print(f"[WARN] call_llm_with_retry attempt {attempt+1} failed: {e}")
            if "rate_limit_exceeded" in err_str or "try again in" in err_str:
                # parse recommended wait time
                wait_time = 60.0
                match = re.search(r'try again in (\d+(?:\.\d+)?)s', str(e), re.IGNORECASE)
                if match:
                    wait_time = float(match.group(1)) + 1.0
                print(f"[WARN] Rate limited. Sleeping for {wait_time:.1f}s, then retrying.")
                time.sleep(wait_time)
            else:
                raise
    raise RuntimeError("Exceeded max_retries due to repeated rate limit or other errors.")

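The retry loop hinges on pulling the server-suggested delay out of the error text, falling back to a flat 60 s when none is found. A standalone check of that regex (the error string is a made-up example):

    import re

    err = "Rate limit reached for model. Please try again in 7.66s."
    m = re.search(r'try again in (\d+(?:\.\d+)?)s', err, re.IGNORECASE)
    wait = float(m.group(1)) + 1.0 if m else 60.0
    print(round(wait, 2))  # 8.66
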
###############################################################################
# TRUNCATION
###############################################################################

def truncate_text_tokens(text: str, max_tokens: int) -> str:
    """
    Truncates 'text' to 'max_tokens' tokens. Used for controlling maximum
    total text size after scraping.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        truncated = tokenizer.decode(tokens[:max_tokens])
        print(f"[DEBUG] Truncating from {len(tokens)} tokens to {max_tokens} tokens.")
        return truncated
    return text

def truncate_text_for_llm(text: str, max_tokens: int = 1024) -> str:
    """
    Typical truncation for partial merges or final calls.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        truncated = tokenizer.decode(tokens[:max_tokens])
        print(f"[DEBUG] Truncating text from {len(tokens)} to {max_tokens} tokens for LLM.")
        return truncated
    return text

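Both helpers cut on token boundaries rather than characters, so the truncated text always decodes cleanly. A minimal round trip with the same cl100k_base encoding:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode("Background music should sit well under the voices.")
    clipped = enc.decode(tokens[:5])   # first 5 tokens back into text
    print(len(tokens), repr(clipped))
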
###############################################################################
# PITCH SHIFT (Optional)
###############################################################################

def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
    shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
    return shifted_audio.set_frame_rate(audio.frame_rate)

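This is the classic resample trick: replaying the same samples at a frame rate scaled by 2^(semitones/12) shifts the pitch (changing speed along with it), and set_frame_rate restores a standard rate afterwards. The scale factors involved:

    # +12 semitones doubles the frame rate; one semitone is roughly +5.9%.
    for semis in (1, 12, -12):
        print(semis, round(2.0 ** (semis / 12.0), 4))  # 1.0595, 2.0, 0.5
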
###############################################################################
# PODCAST SCRIPT GENERATION (Single Call)
###############################################################################

def generate_script(
    system_prompt: str,
    input_text: str,
    tone: str,
    target_length: str,
    host_name: str = "Jane",
    guest_name: str = "John",
    sponsor_style: str = "Separate Break",
    sponsor_provided=None
):
    """
    Generates the entire script in a single LLM call (deepseek-r1 via
    OpenRouter) and parses the returned JSON into a Dialogue.
    """
    print("[LOG] Generating script with tone:", tone, "and length:", target_length)

    language_selection = st.session_state.get("language_selection", "English (American)")
    if (host_name == "Jane" or not host_name) and language_selection in ["English (Indian)", "Hinglish", "Hindi"]:
        host_name = "Isha"
    if (guest_name == "John" or not guest_name) and language_selection in ["English (Indian)", "Hinglish", "Hindi"]:
        guest_name = "Aarav"

    words_per_minute = 150
    numeric_minutes = 3
    match = re.search(r"(\d+)", target_length)
    if match:
        numeric_minutes = int(match.group(1))

    min_words = max(50, numeric_minutes * 100)
    max_words = numeric_minutes * words_per_minute

    tone_map = {
        "Humorous": "funny and exciting, makes people chuckle",
        "Formal": "business-like, well-structured, professional",
        "Casual": "like a conversation between close friends, relaxed and informal",
        "Youthful": "like how teenagers might chat, energetic and lively"
    }
    chosen_tone = tone_map.get(tone, "casual")

    if sponsor_provided:
        if sponsor_style == "Separate Break":
            sponsor_instructions = (
                "If sponsor content is provided, include it in a separate ad break (~30 seconds). "
                "Use 'Now a word from our sponsor...' and end with 'Back to the show', etc."
            )
        else:
            sponsor_instructions = (
                "If sponsor content is provided, blend it naturally (~30 seconds) into conversation. "
                "Avoid abrupt transitions."
            )
    else:
        sponsor_instructions = ""

    prompt = (
        f"{system_prompt}\n"
        f"TONE: {chosen_tone}\n"
        f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
        f"INPUT TEXT: {input_text}\n\n"
        f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
        "Please provide the output in the following JSON format without any extra text:\n"
        "{\n"
        '  "dialogue": [\n'
        '    { "speaker": "Jane", "text": "..." },\n'
        '    { "speaker": "John", "text": "..." }\n'
        "  ]\n"
        "}"
    )
    if language_selection == "Hinglish":
        prompt += "\n\nPlease generate the script in Romanized Hindi.\n"
    elif language_selection == "Hindi":
        prompt += "\n\nPlease generate the script exclusively in Hindi.\n"

    print("[LOG] Sending script generation prompt to LLM.")
    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.7
        }
        resp = requests.post("https://openrouter.ai/api/v1/chat/completions",
                             headers=headers, data=json.dumps(data))
        resp.raise_for_status()
        raw_content = resp.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] LLM error generating script:", e)
        raise ValueError(f"Error generating script: {str(e)}")

    start_idx = raw_content.find("{")
    end_idx = raw_content.rfind("}")
    if start_idx == -1 or end_idx == -1:
        raise ValueError("No JSON found in LLM response for script generation.")

    json_str = raw_content[start_idx:end_idx+1]
    try:
        data_js = json.loads(json_str)
        dialogue_list = data_js.get("dialogue", [])

        # Map the LLM's speaker names onto the internal Jane/John voice slots
        for d in dialogue_list:
            raw_speaker = d.get("speaker", "Jane")
            if raw_speaker.lower() == host_name.lower():
                d["speaker"] = "Jane"
                d["display_speaker"] = host_name
            elif raw_speaker.lower() == guest_name.lower():
                d["speaker"] = "John"
                d["display_speaker"] = guest_name
            else:
                d["speaker"] = "Jane"
                d["display_speaker"] = raw_speaker

        new_dialogue_items = []
        for d in dialogue_list:
            if "display_speaker" not in d:
                d["display_speaker"] = d["speaker"]
            new_dialogue_items.append(DialogueItem(**d))

        return Dialogue(dialogue=new_dialogue_items)

    except json.JSONDecodeError as e:
        print("[ERROR] JSON decoding failed for script generation:", e)
        raise ValueError(f"Script parse error: {str(e)}")
    except Exception as e:
        print("[ERROR] Unknown error parsing script JSON:", e)
        raise ValueError(f"Script parse error: {str(e)}")

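Because deepseek-r1 tends to wrap its answer in extra prose, the parser keeps only the outermost {...} before handing it to json.loads. A toy run of that slice, with a made-up model reply:

    import json

    raw = 'Sure! Here is the script:\n{"dialogue": [{"speaker": "Jane", "text": "Hi!"}]}\nEnjoy.'
    json_str = raw[raw.find("{"):raw.rfind("}") + 1]
    print(json.loads(json_str)["dialogue"][0]["speaker"])  # Jane
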
###############################################################################
# YOUTUBE TRANSCRIPTION (RAPIDAPI)
###############################################################################

def transcribe_youtube_video(video_url: str) -> str:
    print("[LOG] Transcribing YouTube video:", video_url)
    match = re.search(r"(?:v=|/)([0-9A-Za-z_-]{11})", video_url)
    if not match:
        raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
    video_id = match.group(1)
    print("[LOG] Extracted video ID:", video_id)

    base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
    params = {"video_id": video_id, "lang": "en"}
    headers = {
        "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
    }
    try:
        resp = requests.get(base_url, headers=headers, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        if not isinstance(data, list) or not data:
            raise ValueError(f"Unexpected transcript format or empty transcript: {data}")

        transcript_as_text = data[0].get("transcriptionAsText", "").strip()
        if not transcript_as_text:
            raise ValueError("transcriptionAsText missing or empty in RapidAPI response.")

        print("[LOG] Transcript retrieval successful. Sample:", transcript_as_text[:200], "...")
        return transcript_as_text
    except Exception as e:
        print("[ERROR] YouTube transcription error:", e)
        raise ValueError(f"Error transcribing YouTube video: {str(e)}")

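The ID extraction accepts both long and short YouTube URL forms by matching the 11-character video ID after either `v=` or a slash:

    import re

    pattern = r"(?:v=|/)([0-9A-Za-z_-]{11})"
    for url in ("https://www.youtube.com/watch?v=dQw4w9WgXcQ", "https://youtu.be/dQw4w9WgXcQ"):
        print(re.search(pattern, url).group(1))  # dQw4w9WgXcQ in both cases
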
###############################################################################
# AUDIO GENERATION (TTS) AND BG MUSIC MIX
###############################################################################

def _preprocess_text_for_tts(text: str, speaker: str) -> str:
    text = re.sub(r"\bNo\.\b", "Number", text, flags=re.IGNORECASE)
    text = re.sub(r"\bSaaS\b", "sass", text, flags=re.IGNORECASE)

    abbreviations_as_words = {"NASA", "NATO", "UNESCO"}
    def insert_periods_for_abbrev(m):
        abbr = m.group(0)
        if abbr in abbreviations_as_words:
            return abbr
        return ".".join(list(abbr)) + "."

    text = re.sub(r"\b([A-Z]{2,})\b", insert_periods_for_abbrev, text)
    text = re.sub(r"\.\.", ".", text)

    def remove_periods_for_tts(m):
        return m.group().replace(".", " ").strip()

    text = re.sub(r"[A-Z]\.[A-Z](?:\.[A-Z])*\.", remove_periods_for_tts, text)
    text = re.sub(r"-", " ", text)
    text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
    text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)

    if speaker != "Jane":
        def insert_thinking_pause(m):
            wd = m.group(1)
            if random.random() < 0.3:
                filler = random.choice(["hmm,", "well,", "let me see,"])
                return f"{wd}..., {filler}"
            else:
                return f"{wd}...,"
        keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
        text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
        conj_pattern = r"\b(and|but|so|because|however)\b"
        text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)

    text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)

    def capitalize_after_sentence(m):
        return m.group().upper()

    text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_after_sentence, text)
    return text.strip()

def generate_audio_mp3(text: str, speaker: str) -> str:
    """
    Uses Deepgram (English) or Murf (Indian/Hinglish/Hindi) for TTS.
    """
    print(f"[LOG] Generating TTS for speaker={speaker}")
    language_selection = st.session_state.get("language_selection", "English (American)")
    try:
        if language_selection == "English (American)":
            print("[LOG] Using Deepgram for American English TTS.")
            processed_text = text if speaker in ["Jane", "John"] else _preprocess_text_for_tts(text, speaker)
            deepgram_api_url = "https://api.deepgram.com/v1/speak"
            params = {"model": "aura-asteria-en"} if speaker != "John" else {"model": "aura-zeus-en"}
            headers = {
                "Accept": "audio/mpeg",
                "Content-Type": "application/json",
                "Authorization": f"Token {os.environ.get('DEEPGRAM_API_KEY')}"
            }
            body = {"text": processed_text}
            r = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)
            r.raise_for_status()

            content_type = r.headers.get("Content-Type", "")
            if "audio/mpeg" not in content_type:
                raise ValueError("Unexpected content-type from Deepgram TTS.")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        mp3_file.write(chunk)
                mp3_path = mp3_file.name

            audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
            audio_seg = effects.normalize(audio_seg)
            final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
            audio_seg.export(final_mp3_path, format="mp3")
            if os.path.exists(mp3_path):
                os.remove(mp3_path)
            return final_mp3_path

        else:
            print("[LOG] Using Murf API for TTS. Language=", language_selection)
            from indic_transliteration.sanscript import transliterate, DEVANAGARI, IAST
            if language_selection == "Hinglish":
                text = transliterate(text, DEVANAGARI, IAST)
            api_key = os.environ.get("MURF_API_KEY")
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json",
                "api-key": api_key
            }
            multi_native_locale = "hi-IN" if language_selection in ["Hinglish", "Hindi"] else "en-IN"
            if language_selection == "English (Indian)":
                voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"
            elif language_selection in ["Hindi", "Hinglish"]:
                voice_id = "hi-IN-kabir" if speaker == "John" else "hi-IN-shweta"
            else:
                voice_id = "en-IN-aarav" if speaker == "John" else "en-IN-isha"

            payload = {
                "audioDuration": 0,
                "channelType": "MONO",
                "encodeAsBase64": False,
                "format": "WAV",
                "modelVersion": "GEN2",
                "multiNativeLocale": multi_native_locale,
                "pitch": 0,
                "pronunciationDictionary": {},
                "rate": 0,
                "sampleRate": 48000,
                "style": "Conversational",
                "text": text,
                "variation": 1,
                "voiceId": voice_id
            }
            r = requests.post("https://api.murf.ai/v1/speech/generate", headers=headers, json=payload)
            r.raise_for_status()
            j = r.json()
            audio_url = j.get("audioFile")
            if not audio_url:
                raise ValueError("No audioFile URL from Murf API.")
            audio_resp = requests.get(audio_url)
            audio_resp.raise_for_status()

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
                wav_file.write(audio_resp.content)
                wav_path = wav_file.name

            audio_seg = AudioSegment.from_file(wav_path, format="wav")
            audio_seg = effects.normalize(audio_seg)
            final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
            audio_seg.export(final_mp3_path, format="mp3")
            os.remove(wav_path)
            return final_mp3_path
    except Exception as e:
        print("[ERROR] TTS generation error:", e)
        raise ValueError(f"Error generating TTS audio: {str(e)}")

def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
    """
    Overlays 'spoken' with background music, offset by ~2s, volume lowered.
    """
    if custom_music_path:
        music_path = custom_music_path
    else:
        music_path = "bg_music.mp3"

    try:
        bg_music = AudioSegment.from_file(music_path, format="mp3")
    except Exception as e:
        print("[ERROR] Failed to load background music:", e)
        return spoken

    bg_music = bg_music - 18.0  # lower the music by 18 dB
    total_length_ms = len(spoken) + 2000
    looped_music = AudioSegment.empty()
    while len(looped_music) < total_length_ms:
        looped_music += bg_music
    looped_music = looped_music[:total_length_ms]
    final_mix = looped_music.overlay(spoken, position=2000)
    return final_mix

###############################################################################
# Q&A UTILITY (POST-PODCAST)
###############################################################################

def call_groq_api_for_qa(system_prompt: str) -> str:
    """
    Single-step Q&A for post-podcast. Usually short usage => minimal tokens.
    """
    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('GROQ_API_KEY')}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "model": "deepseek-r1-distill-llama-70b",
            "messages": [{"role": "user", "content": system_prompt}],
            "max_tokens": 512,
            "temperature": 0.7
        }
        r = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, data=json.dumps(data))
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] Groq QA error:", e)
        fallback = {"speaker": "John", "text": "Sorry, I'm having trouble answering now."}
        return json.dumps(fallback)

###############################################################################
# LOW-CALL RESEARCH AGENT (Minimizing LLM Calls)
###############################################################################

MODEL_SUMMARIZATION = "llama-3.1-8b-instant"
MODEL_COMBINATION = "deepseek-r1-distill-llama-70b"

def run_research_agent(
    topic: str,
    report_type: str = "research_report",
    max_results: int = 20
) -> str:
    """
    Low-call approach:
      1) Tavily search (up to 20 URLs).
      2) Firecrawl scrape => combined text
      3) Truncate to 12k tokens total
      4) Split => at most 2 x 6k chunks => summarize each chunk once => summaries
      5) Single final merge => final PDF
    => 2 or 3 total LLM calls => drastically fewer calls => less chance of 429

    Logs at each step for clarity.
    """
    print(f"[LOG] Starting LOW-CALL research agent for topic: {topic}")

    try:
        # Step 1: Tavily search
        print("[LOG] Step 1: Searching with Tavily for relevant URLs (max_results=20).")
        tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
        search_data = tavily_client.search(query=topic, max_results=max_results)
        search_results = search_data.get("results", [])
        print(f"[LOG] Tavily provided {len(search_results)} results. Proceeding to Step 2.")
        if not search_results:
            print("[LOG] No relevant search results found by Tavily.")
            return "No relevant search results found."

        references_list = [r["url"] for r in search_results if "url" in r]

        # Step 2: Firecrawl scraping
        print("[LOG] Step 2: Scraping each URL with Firecrawl.")
        combined_content = ""
        for result in search_results:
            url = result["url"]
            print(f"[LOG] Firecrawl scraping: {url}")
            headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
            payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
            try:
                resp = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
                resp.raise_for_status()
                data = resp.json()
                if data.get("success") and "markdown" in data.get("data", {}):
                    combined_content += data["data"]["markdown"] + "\n\n"
                else:
                    print(f"[WARNING] Firecrawl scrape failed or no markdown for {url}: {data.get('error')}")
            except requests.RequestException as e:
                print(f"[ERROR] Firecrawl error for {url}: {e}")
                continue

        if not combined_content:
            print("[LOG] Could not retrieve content from any search results. Exiting.")
            return "Could not retrieve content from any of the search results."

        # Step 3: Truncate to 12k tokens total
        print("[LOG] Step 3: Truncating combined text to 12,000 tokens if needed.")
        combined_content = truncate_text_tokens(combined_content, max_tokens=12000)

        # Step 4: At most 2 chunks => summaries
        print("[LOG] Step 4: Splitting text into up to 2 chunks (6,000 tokens each). Summarizing each chunk.")
        tokenizer = tiktoken.get_encoding("cl100k_base")
        tokens = tokenizer.encode(combined_content)
        chunk_size = 6000

        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        summaries = []
        start = 0
        chunk_index = 1
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunk_text = tokenizer.decode(tokens[start:end])
            print(f"[LOG] Summarizing chunk {chunk_index} with ~{len(tokens[start:end])} tokens.")
            prompt = f"""
            You are a specialized summarization engine. Summarize the following text
            for a professional research report. Provide accurate details but do not
            include chain-of-thought or internal reasoning. Keep it concise, but
            include key data points and context:

            {chunk_text}
            """
            data = {
                "model": MODEL_SUMMARIZATION,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2,
                "max_tokens": 768
            }
            response = call_llm_with_retry(groq_client, **data)
            summary_text = response.choices[0].message.content.strip()
            summaries.append(summary_text)

            start = end
            chunk_index += 1
            # Because chunk_size=6000, only 2 chunks max
            if chunk_index > 2:
                break

        # Step 5: Single final merge call
        print("[LOG] Step 5: Doing one final merge of chunk summaries.")
        references_text = "\n".join(f"- {url}" for url in references_list) if references_list else "None"
        truncated_summaries = [truncate_text_for_llm(s, max_tokens=1000) for s in summaries]
        merged_input = "\n\n".join(truncated_summaries)

        final_prompt = f"""
        IMPORTANT: Do NOT include chain-of-thought or hidden planning.
        Produce a long, academic-style research paper with the following structure:
        - Title Page (concise descriptive title)
        - Table of Contents
        - Executive Summary
        - Introduction
        - Historical or Contextual Background
        - Multiple Thematic Sections (with subheadings)
        - Detailed Analysis (multi-paragraph sections)
        - Footnotes or inline citations referencing the URLs
        - Conclusion
        - References / Bibliography (list these URLs at the end)

        Requirements:
        - Minimal bullet points, prefer multi-paragraph
        - Each section at least 2-3 paragraphs
        - Aim for 1500+ words if possible
        - Under 6000 tokens total
        - Professional, academic tone

        Partial Summaries:
        {merged_input}

        References (URLs):
        {references_text}

        Now, merge these partial summaries into one thoroughly expanded research paper:
        """
        final_data = {
            "model": MODEL_COMBINATION,
            "messages": [{"role": "user", "content": final_prompt}],
            "temperature": 0.3,
            "max_tokens": 2048
        }
        final_response = call_llm_with_retry(groq_client, **final_data)
        final_text = final_response.choices[0].message.content.strip()

        # Step 6: PDF generation
        print("[LOG] Step 6: Generating final PDF from the merged text.")
        final_report = generate_report(final_text)

        print("[LOG] Done! Returning PDF from run_research_agent (low-call).")
        return final_report

    except Exception as e:
        print(f"[ERROR] Error in run_research_agent: {e}")
        return f"Sorry, encountered an error: {str(e)}"