siddhartharyaai committed on
Commit
a2537a4
·
verified ·
1 Parent(s): b98da19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -47
app.py CHANGED
@@ -7,6 +7,7 @@ import os
7
  import tempfile
8
  import pypdf
9
  from pydub import AudioSegment
 
10
 
11
  from utils import (
12
  generate_script,
@@ -28,15 +29,16 @@ def parse_user_edited_transcript(edited_text: str):
28
  pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
29
  matches = re.findall(pattern, edited_text)
30
  if not matches:
31
- # If user changed the format drastically, treat entire text as Jane
32
  return [("Jane", edited_text)]
33
  return matches
34
 
35
  def regenerate_audio_from_dialogue(dialogue_items):
36
  """
37
- Re-generates multi-speaker audio from user-edited text.
 
38
  Returns final audio bytes and updated transcript.
39
  """
 
40
  audio_segments = []
41
  transcript = ""
42
  crossfade_duration = 50 # in ms
@@ -51,16 +53,19 @@ def regenerate_audio_from_dialogue(dialogue_items):
51
  if not audio_segments:
52
  return None, "No audio segments were generated."
53
 
54
- # Combine with crossfade
55
- combined = audio_segments[0]
56
  for seg in audio_segments[1:]:
57
- combined = combined.append(seg, crossfade=crossfade_duration)
58
 
 
 
 
 
59
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
60
- combined.export(temp_audio.name, format="mp3")
61
  final_mp3_path = temp_audio.name
62
 
63
- # Read bytes and return them (so we have a real .mp3 to download)
64
  with open(final_mp3_path, "rb") as f:
65
  audio_bytes = f.read()
66
  os.remove(final_mp3_path)
@@ -69,12 +74,8 @@ def regenerate_audio_from_dialogue(dialogue_items):
69
 
70
  def generate_podcast(file, url, video_url, research_topic_input, tone, length):
71
  """
72
- Creates a multi-speaker podcast from:
73
- - PDF
74
- - URL
75
- - YouTube video
76
- - or a research topic input.
77
- Returns (audio_bytes, transcript_str).
78
  """
79
  sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
80
  if sum(sources) > 1:
@@ -82,9 +83,9 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
82
  if not any(sources):
83
  return None, "Please provide at least one source."
84
 
 
85
  text = ""
86
  if file:
87
- # Handle PDF
88
  try:
89
  if not file.name.lower().endswith('.pdf'):
90
  return None, "Please upload a PDF file."
@@ -93,7 +94,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
93
  except Exception as e:
94
  return None, f"Error reading PDF: {str(e)}"
95
  elif url:
96
- # Handle URL
97
  try:
98
  text = extract_text_from_url(url)
99
  if not text:
@@ -101,7 +101,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
101
  except Exception as e:
102
  return None, f"Error extracting text from URL: {str(e)}"
103
  elif video_url:
104
- # Handle YouTube
105
  try:
106
  text = transcribe_youtube_video(video_url)
107
  if not text:
@@ -109,7 +108,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
109
  except Exception as e:
110
  return None, f"Error transcribing YouTube video: {str(e)}"
111
  elif research_topic_input:
112
- # Handle research topic
113
  try:
114
  text = research_topic(research_topic_input)
115
  if not text:
@@ -117,13 +115,14 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
117
  except Exception as e:
118
  return None, f"Error researching topic: {str(e)}"
119
 
120
- # Generate the multi-speaker script
121
  try:
122
  text = truncate_text(text)
123
  script = generate_script(SYSTEM_PROMPT, text, tone, length)
124
  except Exception as e:
125
  return None, f"Error generating script: {str(e)}"
126
 
 
127
  audio_segments = []
128
  transcript = ""
129
  crossfade_duration = 50 # ms
@@ -139,31 +138,67 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
139
  if not audio_segments:
140
  return None, "No audio segments generated."
141
 
142
- combined = audio_segments[0]
 
143
  for seg in audio_segments[1:]:
144
- combined = combined.append(seg, crossfade=crossfade_duration)
 
 
 
145
 
 
146
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
147
- combined.export(temp_audio.name, format="mp3")
148
  final_mp3_path = temp_audio.name
149
 
150
- # Convert final mp3 to bytes
151
  with open(final_mp3_path, "rb") as f:
152
  audio_bytes = f.read()
153
  os.remove(final_mp3_path)
154
 
155
  return audio_bytes, transcript
 
156
  except Exception as e:
157
  return None, f"Error generating audio: {str(e)}"
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def main():
160
- # Moved set_page_config to the very top of all Streamlit commands
161
- st.set_page_config(
162
- page_title="MyPod - AI-based Podcast Generator",
163
- layout="centered"
164
- )
165
 
166
- # Enable "light or dark" theme via custom CSS
167
  st.markdown(
168
  """
169
  <style>
@@ -197,19 +232,14 @@ def main():
197
  "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
198
  "Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
199
  "### How to use:\n"
200
- "1. **Provide one source:** PDF, URL, YouTube link (Requires User Auth - Work in Progress), or a Topic to Research.\n"
201
  "2. **Choose the tone and the target duration.**\n"
202
  "3. **Click 'Generate Podcast'** to produce your podcast.\n\n"
203
  "**After** the audio is generated, you can **edit** the transcript \n"
204
  "and **re-generate** the audio with your edits if needed.\n\n"
205
- "**Research a Topic:** Please be as detailed as possible in your topic statement. If it's too niche or specific, "
206
- "you might not get the desired outcome. We'll fetch information from Wikipedia and RSS feeds (BBC, CNN, Associated Press, "
207
- "NDTV, Times of India, The Hindu, Economic Times, Google News) or the LLM knowledge base to get recent info about the topic.\n\n"
208
- "**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
209
- "**Note:** YouTube transcription uses Whisper on CPU and may take longer for very long videos.\n\n"
210
- "⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
211
  "and high-quality audio synthesis, which may take a few minutes.\n\n"
212
- "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
213
  )
214
 
215
  col1, col2 = st.columns(2)
@@ -222,7 +252,6 @@ def main():
222
  tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
223
  length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)
224
 
225
- # Use session_state to avoid losing results if user clicks away
226
  if "audio_bytes" not in st.session_state:
227
  st.session_state["audio_bytes"] = None
228
  if "transcript" not in st.session_state:
@@ -231,11 +260,9 @@ def main():
231
  generate_button = st.button("Generate Podcast")
232
 
233
  if generate_button:
234
- # Show a pseudo progress bar for user engagement
235
  progress_bar = st.progress(0)
236
  progress_text = st.empty()
237
 
238
- # Steps to pretend some progress:
239
  progress_text.write("Alright, let's get started...")
240
  progress_bar.progress(10)
241
  time.sleep(1.0)
@@ -258,7 +285,6 @@ def main():
258
 
259
  if audio_bytes is None:
260
  st.error(transcript)
261
- # Clear session state
262
  st.session_state["audio_bytes"] = None
263
  st.session_state["transcript"] = None
264
  else:
@@ -266,11 +292,8 @@ def main():
266
  st.session_state["audio_bytes"] = audio_bytes
267
  st.session_state["transcript"] = transcript
268
 
269
- # Check if we have a stored result
270
  if st.session_state["audio_bytes"]:
271
- # Show the audio
272
  st.audio(st.session_state["audio_bytes"], format='audio/mp3')
273
- # Provide a download button with .mp3 extension
274
  st.download_button(
275
  label="Download Podcast (MP3)",
276
  data=st.session_state["audio_bytes"],
@@ -278,7 +301,6 @@ def main():
278
  mime="audio/mpeg"
279
  )
280
 
281
- # Show the transcript in a text area for editing
282
  st.markdown("### Generated Transcript (Editable)")
283
  edited_text = st.text_area(
284
  "Feel free to tweak lines, fix errors, or reword anything.",
@@ -286,7 +308,6 @@ def main():
286
  height=300
287
  )
288
 
289
- # Regenerate button
290
  if st.button("Regenerate Audio From Edited Text"):
291
  regen_bar = st.progress(0)
292
  regen_text = st.empty()
@@ -299,7 +320,6 @@ def main():
299
  regen_bar.progress(60)
300
  time.sleep(1.0)
301
 
302
- # Parse & regenerate
303
  dialogue_items = parse_user_edited_transcript(edited_text)
304
  new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
305
 
@@ -314,7 +334,6 @@ def main():
314
  regen_text.write("All set!")
315
  st.success("Regenerated audio below:")
316
 
317
- # Store updated
318
  st.session_state["audio_bytes"] = new_audio_bytes
319
  st.session_state["transcript"] = new_transcript
320
 
 
7
  import tempfile
8
  import pypdf
9
  from pydub import AudioSegment
10
+ from pydub import effects # for normalizing volume if needed
11
 
12
  from utils import (
13
  generate_script,
 
29
  pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
30
  matches = re.findall(pattern, edited_text)
31
  if not matches:
 
32
  return [("Jane", edited_text)]
33
  return matches
34
 
35
  def regenerate_audio_from_dialogue(dialogue_items):
36
  """
37
+ Re-generates multi-speaker audio from user-edited text,
38
+ then mixes with background music from the root folder (bg_music.mp3).
39
  Returns final audio bytes and updated transcript.
40
  """
41
+ # 1) Create spoken segments
42
  audio_segments = []
43
  transcript = ""
44
  crossfade_duration = 50 # in ms
 
53
  if not audio_segments:
54
  return None, "No audio segments were generated."
55
 
56
+ # 2) Combine spoken segments
57
+ combined_spoken = audio_segments[0]
58
  for seg in audio_segments[1:]:
59
+ combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
60
 
61
+ # 3) Mix with background music
62
+ final_mix = mix_with_bg_music(combined_spoken)
63
+
64
+ # 4) Export to bytes
65
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
66
+ final_mix.export(temp_audio.name, format="mp3")
67
  final_mp3_path = temp_audio.name
68
 
 
69
  with open(final_mp3_path, "rb") as f:
70
  audio_bytes = f.read()
71
  os.remove(final_mp3_path)
 
74
 
75
  def generate_podcast(file, url, video_url, research_topic_input, tone, length):
76
  """
77
+ Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
78
+ Returns (audio_bytes, transcript_str), mixing with background music in root folder (bg_music.mp3).
 
 
 
 
79
  """
80
  sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
81
  if sum(sources) > 1:
 
83
  if not any(sources):
84
  return None, "Please provide at least one source."
85
 
86
+ # 1) Fetch text
87
  text = ""
88
  if file:
 
89
  try:
90
  if not file.name.lower().endswith('.pdf'):
91
  return None, "Please upload a PDF file."
 
94
  except Exception as e:
95
  return None, f"Error reading PDF: {str(e)}"
96
  elif url:
 
97
  try:
98
  text = extract_text_from_url(url)
99
  if not text:
 
101
  except Exception as e:
102
  return None, f"Error extracting text from URL: {str(e)}"
103
  elif video_url:
 
104
  try:
105
  text = transcribe_youtube_video(video_url)
106
  if not text:
 
108
  except Exception as e:
109
  return None, f"Error transcribing YouTube video: {str(e)}"
110
  elif research_topic_input:
 
111
  try:
112
  text = research_topic(research_topic_input)
113
  if not text:
 
115
  except Exception as e:
116
  return None, f"Error researching topic: {str(e)}"
117
 
118
+ # 2) Generate multi-speaker script
119
  try:
120
  text = truncate_text(text)
121
  script = generate_script(SYSTEM_PROMPT, text, tone, length)
122
  except Exception as e:
123
  return None, f"Error generating script: {str(e)}"
124
 
125
+ # 3) Convert dialogue to spoken segments
126
  audio_segments = []
127
  transcript = ""
128
  crossfade_duration = 50 # ms
 
138
  if not audio_segments:
139
  return None, "No audio segments generated."
140
 
141
+ # Combine
142
+ combined_spoken = audio_segments[0]
143
  for seg in audio_segments[1:]:
144
+ combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
145
+
146
+ # Mix with background music
147
+ final_mix = mix_with_bg_music(combined_spoken)
148
 
149
+ # Export to bytes
150
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
151
+ final_mix.export(temp_audio.name, format="mp3")
152
  final_mp3_path = temp_audio.name
153
 
 
154
  with open(final_mp3_path, "rb") as f:
155
  audio_bytes = f.read()
156
  os.remove(final_mp3_path)
157
 
158
  return audio_bytes, transcript
159
+
160
  except Exception as e:
161
  return None, f"Error generating audio: {str(e)}"
162
 
163
def mix_with_bg_music(
    spoken: AudioSegment,
    bg_music_path: str = "bg_music.mp3",
    intro_ms: int = 2000,
    music_gain_db: float = -14.0,
) -> AudioSegment:
    """
    Lay the spoken track over a quieter, looped background-music bed.

    The music starts alone for ``intro_ms`` milliseconds, then the speech is
    overlaid on top of it. The music is repeated until it covers the whole
    result and is then cropped to exactly ``len(spoken) + intro_ms``.

    Args:
        spoken: The combined speech audio.
        bg_music_path: Path to the background track. It is decoded as MP3.
            A relative path is resolved against the current working directory.
        intro_ms: Milliseconds of music-only lead-in before the speech starts.
            Negative values are treated as 0.
        music_gain_db: Gain (in dB) applied to the music; negative values make
            it quieter so the speech stays intelligible.

    Returns:
        The mixed audio. If the background track cannot be loaded, or decodes
        to a zero-length segment, ``spoken`` is returned unchanged so podcast
        generation still succeeds without music.
    """
    try:
        bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
    except Exception as e:
        # Best-effort: missing/corrupt music must not fail the whole podcast.
        print("[ERROR] Failed to load background music:", e)
        return spoken

    # A zero-length track would make the looping below spin forever, because
    # appending it never increases the length of the bed.
    if len(bg_music) == 0:
        print("[ERROR] Background music is empty; skipping mix.")
        return spoken

    # Lower the music volume so the speech is clear.
    bg_music = bg_music.apply_gain(music_gain_db)

    intro_ms = max(0, int(intro_ms))
    total_length_ms = len(spoken) + intro_ms

    # Loop the music until it is at least as long as the final mix.
    looped_music = AudioSegment.empty()
    while len(looped_music) < total_length_ms:
        looped_music += bg_music

    # Crop to the exact target length; the overlay result keeps this length.
    looped_music = looped_music[:total_length_ms]

    # Speech begins after the music-only intro.
    return looped_music.overlay(spoken, position=intro_ms)
197
+
198
  def main():
199
+ # Move set_page_config to the top if needed
200
+ st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
 
 
 
201
 
 
202
  st.markdown(
203
  """
204
  <style>
 
232
  "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
233
  "Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
234
  "### How to use:\n"
235
+ "1. **Provide one source:** PDF, URL, YouTube link, or a Topic to Research.\n"
236
  "2. **Choose the tone and the target duration.**\n"
237
  "3. **Click 'Generate Podcast'** to produce your podcast.\n\n"
238
  "**After** the audio is generated, you can **edit** the transcript \n"
239
  "and **re-generate** the audio with your edits if needed.\n\n"
240
+ "⏳**Please be patient while your podcast is being generated.** It involves content analysis, script creation, "
 
 
 
 
 
241
  "and high-quality audio synthesis, which may take a few minutes.\n\n"
242
+ "🔥 **Ready to create your personalized podcast?** Give it a try now!"
243
  )
244
 
245
  col1, col2 = st.columns(2)
 
252
  tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
253
  length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)
254
 
 
255
  if "audio_bytes" not in st.session_state:
256
  st.session_state["audio_bytes"] = None
257
  if "transcript" not in st.session_state:
 
260
  generate_button = st.button("Generate Podcast")
261
 
262
  if generate_button:
 
263
  progress_bar = st.progress(0)
264
  progress_text = st.empty()
265
 
 
266
  progress_text.write("Alright, let's get started...")
267
  progress_bar.progress(10)
268
  time.sleep(1.0)
 
285
 
286
  if audio_bytes is None:
287
  st.error(transcript)
 
288
  st.session_state["audio_bytes"] = None
289
  st.session_state["transcript"] = None
290
  else:
 
292
  st.session_state["audio_bytes"] = audio_bytes
293
  st.session_state["transcript"] = transcript
294
 
 
295
  if st.session_state["audio_bytes"]:
 
296
  st.audio(st.session_state["audio_bytes"], format='audio/mp3')
 
297
  st.download_button(
298
  label="Download Podcast (MP3)",
299
  data=st.session_state["audio_bytes"],
 
301
  mime="audio/mpeg"
302
  )
303
 
 
304
  st.markdown("### Generated Transcript (Editable)")
305
  edited_text = st.text_area(
306
  "Feel free to tweak lines, fix errors, or reword anything.",
 
308
  height=300
309
  )
310
 
 
311
  if st.button("Regenerate Audio From Edited Text"):
312
  regen_bar = st.progress(0)
313
  regen_text = st.empty()
 
320
  regen_bar.progress(60)
321
  time.sleep(1.0)
322
 
 
323
  dialogue_items = parse_user_edited_transcript(edited_text)
324
  new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
325
 
 
334
  regen_text.write("All set!")
335
  st.success("Regenerated audio below:")
336
 
 
337
  st.session_state["audio_bytes"] = new_audio_bytes
338
  st.session_state["transcript"] = new_transcript
339