siddhartharyaai committed on
Commit
337f622
Β·
verified Β·
1 Parent(s): 81ea4ea

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +436 -0
app.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import streamlit as st
4
+ import time
5
+ import re
6
+ import os
7
+ import tempfile
8
+ import pypdf
9
+ from pydub import AudioSegment, effects
10
+ import difflib # For computing differences between texts
11
+
12
+ from utils import (
13
+ generate_script,
14
+ generate_audio_mp3,
15
+ truncate_text,
16
+ extract_text_from_url,
17
+ transcribe_youtube_video,
18
+ research_topic
19
+ )
20
+ from prompts import SYSTEM_PROMPT
21
+
22
+
23
+ def parse_user_edited_transcript(edited_text: str):
24
+ """
25
+ Looks for lines like:
26
+ **Jane**: Hello
27
+ **John**: Sure, I'd love to talk about that.
28
+ Returns a list of (speaker, text).
29
+ """
30
+ pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
31
+ matches = re.findall(pattern, edited_text)
32
+ if not matches:
33
+ return [("Jane", edited_text)]
34
+ return matches
35
+
36
+
37
+ def regenerate_audio_from_dialogue(dialogue_items):
38
+ """
39
+ Re-generates multi-speaker audio from user-edited text,
40
+ then mixes with background music in the root folder (bg_music.mp3).
41
+ Returns final audio bytes and updated transcript.
42
+ """
43
+ audio_segments = []
44
+ transcript = ""
45
+ crossfade_duration = 50 # in ms
46
+
47
+ for speaker, line_text in dialogue_items:
48
+ audio_file = generate_audio_mp3(line_text, speaker)
49
+ seg = AudioSegment.from_file(audio_file, format="mp3")
50
+ audio_segments.append(seg)
51
+ transcript += f"**{speaker}**: {line_text}\n\n"
52
+ os.remove(audio_file)
53
+
54
+ if not audio_segments:
55
+ return None, "No audio segments were generated."
56
+
57
+ # Combine spoken segments
58
+ combined_spoken = audio_segments[0]
59
+ for seg in audio_segments[1:]:
60
+ combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
61
+
62
+ # Mix with background music
63
+ final_mix = mix_with_bg_music(combined_spoken)
64
+
65
+ # Export to bytes
66
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
67
+ final_mix.export(temp_audio.name, format="mp3")
68
+ final_mp3_path = temp_audio.name
69
+
70
+ with open(final_mp3_path, "rb") as f:
71
+ audio_bytes = f.read()
72
+ os.remove(final_mp3_path)
73
+
74
+ return audio_bytes, transcript
75
+
76
+
77
+ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
78
+ """
79
+ Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
80
+ Returns (audio_bytes, transcript_str), mixing with background music in root folder (bg_music.mp3).
81
+ """
82
+ sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
83
+ if sum(sources) > 1:
84
+ return None, "Provide only one input (PDF, URL, YouTube, or Research topic)."
85
+ if not any(sources):
86
+ return None, "Please provide at least one source."
87
+
88
+ text = ""
89
+ if file:
90
+ try:
91
+ if not file.name.lower().endswith('.pdf'):
92
+ return None, "Please upload a PDF file."
93
+ # Use the file-like object directly to read the PDF
94
+ reader = pypdf.PdfReader(file)
95
+ text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())
96
+ except Exception as e:
97
+ return None, f"Error reading PDF: {str(e)}"
98
+ elif url:
99
+ try:
100
+ text = extract_text_from_url(url)
101
+ if not text:
102
+ return None, "Failed to extract text from URL."
103
+ except Exception as e:
104
+ return None, f"Error extracting text from URL: {str(e)}"
105
+ elif video_url:
106
+ try:
107
+ text = transcribe_youtube_video(video_url)
108
+ if not text:
109
+ return None, "Failed to transcribe YouTube video."
110
+ except Exception as e:
111
+ return None, f"Error transcribing YouTube video: {str(e)}"
112
+ elif research_topic_input:
113
+ try:
114
+ text = research_topic(research_topic_input)
115
+ if not text:
116
+ return None, f"Sorry, no information found on '{research_topic_input}'."
117
+ except Exception as e:
118
+ return None, f"Error researching topic: {str(e)}"
119
+
120
+ # Generate script
121
+ try:
122
+ text = truncate_text(text)
123
+ script = generate_script(SYSTEM_PROMPT, text, tone, length)
124
+ except Exception as e:
125
+ return None, f"Error generating script: {str(e)}"
126
+
127
+ audio_segments = []
128
+ transcript = ""
129
+ crossfade_duration = 50 # ms
130
+
131
+ try:
132
+ for item in script.dialogue:
133
+ audio_file = generate_audio_mp3(item.text, item.speaker)
134
+ seg = AudioSegment.from_file(audio_file, format="mp3")
135
+ audio_segments.append(seg)
136
+ transcript += f"**{item.speaker}**: {item.text}\n\n"
137
+ os.remove(audio_file)
138
+
139
+ if not audio_segments:
140
+ return None, "No audio segments generated."
141
+
142
+ combined_spoken = audio_segments[0]
143
+ for seg in audio_segments[1:]:
144
+ combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
145
+
146
+ # Mix with bg music
147
+ final_mix = mix_with_bg_music(combined_spoken)
148
+
149
+ # Export to bytes
150
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
151
+ final_mix.export(temp_audio.name, format="mp3")
152
+ final_mp3_path = temp_audio.name
153
+
154
+ with open(final_mp3_path, "rb") as f:
155
+ audio_bytes = f.read()
156
+ os.remove(final_mp3_path)
157
+
158
+ return audio_bytes, transcript
159
+
160
+ except Exception as e:
161
+ return None, f"Error generating audio: {str(e)}"
162
+
163
+
164
+ def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
165
+ """
166
+ Mixes 'spoken' with bg_music.mp3 in the root folder:
167
+ 1) Start with 2 seconds of music alone before speech begins.
168
+ 2) Loop the music if it's shorter than the final audio length.
169
+ 3) Lower the music volume so the speech is clear.
170
+ """
171
+ bg_music_path = "bg_music.mp3" # in root folder
172
+
173
+ try:
174
+ bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
175
+ except Exception as e:
176
+ print("[ERROR] Failed to load background music:", e)
177
+ return spoken
178
+
179
+ bg_music = bg_music - 14.0 # Lower volume (e.g. -14 dB)
180
+
181
+ total_length_ms = len(spoken) + 2000
182
+ looped_music = AudioSegment.empty()
183
+ while len(looped_music) < total_length_ms:
184
+ looped_music += bg_music
185
+
186
+ looped_music = looped_music[:total_length_ms]
187
+
188
+ # Overlay spoken at 2000ms so we get 2s of music first
189
+ final_mix = looped_music.overlay(spoken, position=2000)
190
+
191
+ return final_mix
192
+
193
+
194
+ def highlight_differences(original: str, edited: str) -> str:
195
+ """
196
+ Highlights the differences between the original and edited transcripts.
197
+ Added or modified words are wrapped in <span> tags with red color.
198
+ """
199
+ matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
200
+ highlighted = []
201
+ for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
202
+ if opcode == 'equal':
203
+ # Unchanged words
204
+ highlighted.extend(original.split()[i1:i2])
205
+ elif opcode in ('replace', 'insert'):
206
+ # Added or replaced words - highlight in red
207
+ added_words = edited.split()[j1:j2]
208
+ highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
209
+ elif opcode == 'delete':
210
+ # Deleted words - optionally, can be shown differently
211
+ # For now, we'll ignore deletions in the highlighted transcript
212
+ pass
213
+ return ' '.join(highlighted)
214
+
215
+
216
+ def main():
217
+ st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
218
+
219
+ # Use smaller font for the main header
220
+ st.markdown("## MyPod - AI powered Podcast Generator")
221
+
222
+ st.markdown(
223
+ "Welcome to **MyPod**, your go-to AI-powered podcast generator! πŸŽ‰\n\n"
224
+ "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
225
+ "Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
226
+ "### How to use:\n"
227
+ "1. **Provide one source:** PDF Files, Website URL, YouTube link or a Topic to Research.\n"
228
+ "2. **Choose the tone and the target duration.**\n"
229
+ "3. **Click 'Generate Podcast'** to produce your podcast. After the audio is generated, you can edit the transcript and re-generate the audio with your edits if needed.\n\n"
230
+ "**Research a Topic:** Please be as detailed as possible in your topic statement. If it's too niche or specific, "
231
+ "you might not get the desired outcome. We'll fetch information from Wikipedia, News RSS feeds or the LLM knowledge base to get recent info about the topic.\n\n"
232
+ "**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
233
+ "**Note:** YouTube videos will only work if they have captions built in.\n\n"
234
+ "⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
235
+ "and high-quality audio synthesis, which may take a few minutes.\n\n"
236
+ "πŸ”₯ **Ready to create your personalized podcast?** Give it a try now and let the magic happen! πŸ”₯"
237
+ )
238
+
239
+ col1, col2 = st.columns(2)
240
+ with col1:
241
+ file = st.file_uploader("Upload File (.pdf only)", type=["pdf"])
242
+ url = st.text_input("Or Enter Website URL")
243
+ video_url = st.text_input("Or Enter YouTube Link (Captioned videos)")
244
+ with col2:
245
+ research_topic_input = st.text_input("Or Research a Topic")
246
+ tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
247
+ length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)
248
+
249
+ # Store results in session_state
250
+ if "audio_bytes" not in st.session_state:
251
+ st.session_state["audio_bytes"] = None
252
+ if "transcript" not in st.session_state:
253
+ st.session_state["transcript"] = None
254
+ if "transcript_original" not in st.session_state:
255
+ st.session_state["transcript_original"] = None # Store original transcript
256
+
257
+ # Add only the "Generate Podcast" button, centered
258
+ generate_button = st.button("Generate Podcast")
259
+
260
+ if generate_button:
261
+ progress_bar = st.progress(0)
262
+ progress_text = st.empty()
263
+
264
+ # Define progress stages and messages
265
+ progress_messages = [
266
+ "πŸ” Analyzing your input...",
267
+ "πŸ“ Crafting the perfect script...",
268
+ "πŸŽ™οΈ Generating high-quality audio...",
269
+ "🎢 Adding the finishing touches..."
270
+ ]
271
+
272
+ # Initialize progress at 0%
273
+ progress_text.write(progress_messages[0])
274
+ progress_bar.progress(0)
275
+ time.sleep(1.0)
276
+
277
+ # Update to 25%
278
+ progress_text.write(progress_messages[1])
279
+ progress_bar.progress(25)
280
+ time.sleep(1.0)
281
+
282
+ # Update to 50%
283
+ progress_text.write(progress_messages[2])
284
+ progress_bar.progress(50)
285
+ time.sleep(1.0)
286
+
287
+ # Update to 75%
288
+ progress_text.write(progress_messages[3])
289
+ progress_bar.progress(75)
290
+ time.sleep(1.0)
291
+
292
+ # Finalize to 100%
293
+ audio_bytes, transcript = generate_podcast(
294
+ file, url, video_url, research_topic_input, tone, length
295
+ )
296
+
297
+ progress_bar.progress(100)
298
+ progress_text.write("βœ… Done!")
299
+
300
+ if audio_bytes is None:
301
+ st.error(transcript)
302
+ st.session_state["audio_bytes"] = None
303
+ st.session_state["transcript"] = None
304
+ st.session_state["transcript_original"] = None
305
+ else:
306
+ st.success("Podcast generated successfully!")
307
+ st.session_state["audio_bytes"] = audio_bytes
308
+ st.session_state["transcript"] = transcript
309
+ st.session_state["transcript_original"] = transcript # Store original transcript
310
+
311
+ if st.session_state["audio_bytes"]:
312
+ st.audio(st.session_state["audio_bytes"], format='audio/mp3')
313
+ st.download_button(
314
+ label="Download Podcast (MP3)",
315
+ data=st.session_state["audio_bytes"],
316
+ file_name="my_podcast.mp3",
317
+ mime="audio/mpeg"
318
+ )
319
+
320
+ st.markdown("### Generated Transcript (Editable)")
321
+
322
+ # Editable text area for transcript
323
+ edited_text = st.text_area(
324
+ "Feel free to tweak lines, fix errors, or reword anything.",
325
+ value=st.session_state["transcript"],
326
+ height=300
327
+ )
328
+
329
+ # Compute differences and highlight edited text
330
+ if st.session_state["transcript_original"]:
331
+ highlighted_transcript = highlight_differences(
332
+ st.session_state["transcript_original"],
333
+ edited_text
334
+ )
335
+
336
+ st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True)
337
+ st.markdown(highlighted_transcript, unsafe_allow_html=True)
338
+
339
+ if st.button("Regenerate Audio From Edited Text"):
340
+ regen_bar = st.progress(0)
341
+ regen_text = st.empty()
342
+
343
+ regen_text.write("πŸ”„ Regenerating your podcast with the edits...")
344
+ regen_bar.progress(25)
345
+ time.sleep(1.0)
346
+
347
+ regen_text.write("πŸ”§ Adjusting the script based on your changes...")
348
+ regen_bar.progress(50)
349
+ time.sleep(1.0)
350
+
351
+ dialogue_items = parse_user_edited_transcript(edited_text)
352
+ new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
353
+
354
+ regen_bar.progress(75)
355
+ time.sleep(1.0)
356
+
357
+ if new_audio_bytes is None:
358
+ regen_bar.progress(100)
359
+ st.error(new_transcript)
360
+ else:
361
+ regen_bar.progress(100)
362
+ regen_text.write("βœ… Regeneration complete!")
363
+ st.success("Regenerated audio below:")
364
+
365
+ st.session_state["audio_bytes"] = new_audio_bytes
366
+ st.session_state["transcript"] = new_transcript
367
+ st.session_state["transcript_original"] = new_transcript # Update original transcript
368
+
369
+ st.audio(new_audio_bytes, format='audio/mp3')
370
+ st.download_button(
371
+ label="Download Edited Podcast (MP3)",
372
+ data=new_audio_bytes,
373
+ file_name="my_podcast_edited.mp3",
374
+ mime="audio/mpeg"
375
+ )
376
+ st.markdown("### Updated Transcript")
377
+ st.markdown(new_transcript)
378
+
379
+
380
+ # ---------------------------------------------------------------------
381
+ # Function to mix with background music is same as before
382
+ # ---------------------------------------------------------------------
383
+ def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
384
+ """
385
+ Mixes 'spoken' with bg_music.mp3 in the root folder:
386
+ 1) Start with 2 seconds of music alone before speech begins.
387
+ 2) Loop the music if it's shorter than the final audio length.
388
+ 3) Lower the music volume so the speech is clear.
389
+ """
390
+ bg_music_path = "bg_music.mp3" # in root folder
391
+
392
+ try:
393
+ bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
394
+ except Exception as e:
395
+ print("[ERROR] Failed to load background music:", e)
396
+ return spoken
397
+
398
+ bg_music = bg_music - 14.0 # Lower volume (e.g. -14 dB)
399
+
400
+ total_length_ms = len(spoken) + 2000
401
+ looped_music = AudioSegment.empty()
402
+ while len(looped_music) < total_length_ms:
403
+ looped_music += bg_music
404
+
405
+ looped_music = looped_music[:total_length_ms]
406
+
407
+ # Overlay spoken at 2000ms so we get 2s of music first
408
+ final_mix = looped_music.overlay(spoken, position=2000)
409
+
410
+ return final_mix
411
+
412
+
413
+ def highlight_differences(original: str, edited: str) -> str:
414
+ """
415
+ Highlights the differences between the original and edited transcripts.
416
+ Added or modified words are wrapped in <span> tags with red color.
417
+ """
418
+ matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
419
+ highlighted = []
420
+ for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
421
+ if opcode == 'equal':
422
+ # Unchanged words
423
+ highlighted.extend(original.split()[i1:i2])
424
+ elif opcode in ('replace', 'insert'):
425
+ # Added or replaced words - highlight in red
426
+ added_words = edited.split()[j1:j2]
427
+ highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
428
+ elif opcode == 'delete':
429
+ # Deleted words - optionally, can be shown differently
430
+ # For now, we'll ignore deletions in the highlighted transcript
431
+ pass
432
+ return ' '.join(highlighted)
433
+
434
+
435
+ if __name__ == "__main__":
436
+ main()