MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 6

Commit

a2537a4

verified ·

1 Parent(s): b98da19

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -47

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import os
 import tempfile
 import pypdf
 from pydub import AudioSegment
 from utils import (
     generate_script,
@@ -28,15 +29,16 @@ def parse_user_edited_transcript(edited_text: str):
     pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
     matches = re.findall(pattern, edited_text)
     if not matches:
-        # If user changed the format drastically, treat entire text as Jane
         return [("Jane", edited_text)]
     return matches
 def regenerate_audio_from_dialogue(dialogue_items):
     """
-    Re-generates multi-speaker audio from user-edited text.
     Returns final audio bytes and updated transcript.
     """
     audio_segments = []
     transcript = ""
     crossfade_duration = 50  # in ms
@@ -51,16 +53,19 @@ def regenerate_audio_from_dialogue(dialogue_items):
     if not audio_segments:
         return None, "No audio segments were generated."
-    # Combine with crossfade
-    combined = audio_segments[0]
     for seg in audio_segments[1:]:
-        combined = combined.append(seg, crossfade=crossfade_duration)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
-        combined.export(temp_audio.name, format="mp3")
         final_mp3_path = temp_audio.name
-    # Read bytes and return them (so we have a real .mp3 to download)
     with open(final_mp3_path, "rb") as f:
         audio_bytes = f.read()
     os.remove(final_mp3_path)
@@ -69,12 +74,8 @@ def regenerate_audio_from_dialogue(dialogue_items):
 def generate_podcast(file, url, video_url, research_topic_input, tone, length):
     """
-    Creates a multi-speaker podcast from:
-      - PDF
-      - URL
-      - YouTube video
-      - or a research topic input.
-    Returns (audio_bytes, transcript_str).
     """
     sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
     if sum(sources) > 1:
@@ -82,9 +83,9 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
     if not any(sources):
         return None, "Please provide at least one source."
     text = ""
     if file:
-        # Handle PDF
         try:
             if not file.name.lower().endswith('.pdf'):
                 return None, "Please upload a PDF file."
@@ -93,7 +94,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
         except Exception as e:
             return None, f"Error reading PDF: {str(e)}"
     elif url:
-        # Handle URL
         try:
             text = extract_text_from_url(url)
             if not text:
@@ -101,7 +101,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
         except Exception as e:
             return None, f"Error extracting text from URL: {str(e)}"
     elif video_url:
-        # Handle YouTube
         try:
             text = transcribe_youtube_video(video_url)
             if not text:
@@ -109,7 +108,6 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
         except Exception as e:
             return None, f"Error transcribing YouTube video: {str(e)}"
     elif research_topic_input:
-        # Handle research topic
         try:
             text = research_topic(research_topic_input)
             if not text:
@@ -117,13 +115,14 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
         except Exception as e:
             return None, f"Error researching topic: {str(e)}"
-    # Generate the multi-speaker script
     try:
         text = truncate_text(text)
         script = generate_script(SYSTEM_PROMPT, text, tone, length)
     except Exception as e:
         return None, f"Error generating script: {str(e)}"
     audio_segments = []
     transcript = ""
     crossfade_duration = 50  # ms
@@ -139,31 +138,67 @@ def generate_podcast(file, url, video_url, research_topic_input, tone, length):
         if not audio_segments:
             return None, "No audio segments generated."
-        combined = audio_segments[0]
         for seg in audio_segments[1:]:
-            combined = combined.append(seg, crossfade=crossfade_duration)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
-            combined.export(temp_audio.name, format="mp3")
             final_mp3_path = temp_audio.name
-        # Convert final mp3 to bytes
         with open(final_mp3_path, "rb") as f:
             audio_bytes = f.read()
         os.remove(final_mp3_path)
         return audio_bytes, transcript
     except Exception as e:
         return None, f"Error generating audio: {str(e)}"
 def main():
-    # Moved set_page_config to the very top of all Streamlit commands
-    st.set_page_config(
-        page_title="MyPod - AI-based Podcast Generator",
-        layout="centered"
-    )
-    # Enable "light or dark" theme via custom CSS
     st.markdown(
         """
         <style>
@@ -197,19 +232,14 @@ def main():
         "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
         "Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
         "### How to use:\n"
-        "1. **Provide one source:** PDF, URL, YouTube link (Requires User Auth - Work in Progress), or a Topic to Research.\n"
         "2. **Choose the tone and the target duration.**\n"
         "3. **Click 'Generate Podcast'** to produce your podcast.\n\n"
         "**After** the audio is generated, you can **edit** the transcript \n"
         "and **re-generate** the audio with your edits if needed.\n\n"
-        "**Research a Topic:** Please be as detailed as possible in your topic statement. If it's too niche or specific, "
-        "you might not get the desired outcome. We'll fetch information from Wikipedia and RSS feeds (BBC, CNN, Associated Press, "
-        "NDTV, Times of India, The Hindu, Economic Times, Google News) or the LLM knowledge base to get recent info about the topic.\n\n"
-        "**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
-        "**Note:** YouTube transcription uses Whisper on CPU and may take longer for very long videos.\n\n"
-        "⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
         "and high-quality audio synthesis, which may take a few minutes.\n\n"
-        "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
     )
     col1, col2 = st.columns(2)
@@ -222,7 +252,6 @@ def main():
         tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
         length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)
-    # Use session_state to avoid losing results if user clicks away
     if "audio_bytes" not in st.session_state:
         st.session_state["audio_bytes"] = None
     if "transcript" not in st.session_state:
@@ -231,11 +260,9 @@ def main():
     generate_button = st.button("Generate Podcast")
     if generate_button:
-        # Show a pseudo progress bar for user engagement
         progress_bar = st.progress(0)
         progress_text = st.empty()
-        # Steps to pretend some progress:
         progress_text.write("Alright, let's get started...")
         progress_bar.progress(10)
         time.sleep(1.0)
@@ -258,7 +285,6 @@ def main():
         if audio_bytes is None:
             st.error(transcript)
-            # Clear session state
             st.session_state["audio_bytes"] = None
             st.session_state["transcript"] = None
         else:
@@ -266,11 +292,8 @@ def main():
             st.session_state["audio_bytes"] = audio_bytes
             st.session_state["transcript"] = transcript
-    # Check if we have a stored result
     if st.session_state["audio_bytes"]:
-        # Show the audio
         st.audio(st.session_state["audio_bytes"], format='audio/mp3')
-        # Provide a download button with .mp3 extension
         st.download_button(
             label="Download Podcast (MP3)",
             data=st.session_state["audio_bytes"],
@@ -278,7 +301,6 @@ def main():
             mime="audio/mpeg"
         )
-        # Show the transcript in a text area for editing
         st.markdown("### Generated Transcript (Editable)")
         edited_text = st.text_area(
             "Feel free to tweak lines, fix errors, or reword anything.",
@@ -286,7 +308,6 @@ def main():
             height=300
         )
-        # Regenerate button
         if st.button("Regenerate Audio From Edited Text"):
             regen_bar = st.progress(0)
             regen_text = st.empty()
@@ -299,7 +320,6 @@ def main():
             regen_bar.progress(60)
             time.sleep(1.0)
-            # Parse & regenerate
             dialogue_items = parse_user_edited_transcript(edited_text)
             new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
@@ -314,7 +334,6 @@ def main():
                 regen_text.write("All set!")
                 st.success("Regenerated audio below:")
-                # Store updated
                 st.session_state["audio_bytes"] = new_audio_bytes
                 st.session_state["transcript"] = new_transcript

 import tempfile
 import pypdf
 from pydub import AudioSegment
+from pydub import effects  # for normalizing volume if needed
 from utils import (
     generate_script,
     pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
     matches = re.findall(pattern, edited_text)
     if not matches:
         return [("Jane", edited_text)]
     return matches
 def regenerate_audio_from_dialogue(dialogue_items):
     """
+    Re-generates multi-speaker audio from user-edited text,
+    then mixes with background music from the root folder (bg_music.mp3).
     Returns final audio bytes and updated transcript.
     """
+    # 1) Create spoken segments
     audio_segments = []
     transcript = ""
     crossfade_duration = 50  # in ms
     if not audio_segments:
         return None, "No audio segments were generated."
+    # 2) Combine spoken segments
+    combined_spoken = audio_segments[0]
     for seg in audio_segments[1:]:
+        combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
+    # 3) Mix with background music
+    final_mix = mix_with_bg_music(combined_spoken)
+    # 4) Export to bytes
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+        final_mix.export(temp_audio.name, format="mp3")
         final_mp3_path = temp_audio.name
     with open(final_mp3_path, "rb") as f:
         audio_bytes = f.read()
     os.remove(final_mp3_path)
 def generate_podcast(file, url, video_url, research_topic_input, tone, length):
     """
+    Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
+    Returns (audio_bytes, transcript_str), mixing with background music in root folder (bg_music.mp3).
     """
     sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
     if sum(sources) > 1:
     if not any(sources):
         return None, "Please provide at least one source."
+    # 1) Fetch text
     text = ""
     if file:
         try:
             if not file.name.lower().endswith('.pdf'):
                 return None, "Please upload a PDF file."
         except Exception as e:
             return None, f"Error reading PDF: {str(e)}"
     elif url:
         try:
             text = extract_text_from_url(url)
             if not text:
         except Exception as e:
             return None, f"Error extracting text from URL: {str(e)}"
     elif video_url:
         try:
             text = transcribe_youtube_video(video_url)
             if not text:
         except Exception as e:
             return None, f"Error transcribing YouTube video: {str(e)}"
     elif research_topic_input:
         try:
             text = research_topic(research_topic_input)
             if not text:
         except Exception as e:
             return None, f"Error researching topic: {str(e)}"
+    # 2) Generate multi-speaker script
     try:
         text = truncate_text(text)
         script = generate_script(SYSTEM_PROMPT, text, tone, length)
     except Exception as e:
         return None, f"Error generating script: {str(e)}"
+    # 3) Convert dialogue to spoken segments
     audio_segments = []
     transcript = ""
     crossfade_duration = 50  # ms
         if not audio_segments:
             return None, "No audio segments generated."
+        # Combine
+        combined_spoken = audio_segments[0]
         for seg in audio_segments[1:]:
+            combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
+        # Mix with background music
+        final_mix = mix_with_bg_music(combined_spoken)
+        # Export to bytes
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
+            final_mix.export(temp_audio.name, format="mp3")
             final_mp3_path = temp_audio.name
         with open(final_mp3_path, "rb") as f:
             audio_bytes = f.read()
         os.remove(final_mp3_path)
         return audio_bytes, transcript
     except Exception as e:
         return None, f"Error generating audio: {str(e)}"
def mix_with_bg_music(
    spoken: AudioSegment,
    bg_music_path: str = "bg_music.mp3",
    music_gain_db: float = -14.0,
    intro_ms: int = 2000,
) -> AudioSegment:
    """
    Lay the spoken track over a looping, attenuated background-music bed.

    The result starts with `intro_ms` milliseconds of music alone, after
    which the speech is overlaid. The music is repeated as often as needed
    to cover the intro plus the full speech, then cropped to that length.

    Args:
        spoken: The already-combined speech audio.
        bg_music_path: Path of the MP3 used as background music. The default
            is a bare relative filename, so it is resolved against the
            process's current working directory.
        music_gain_db: Gain applied to the music in dB. The default of -14 dB
            lowers the music so the speech stays clear.
        intro_ms: Length of the music-only lead-in, in milliseconds.
            Negative values are treated as 0.

    Returns:
        The mixed audio, `intro_ms + len(spoken)` milliseconds long. If the
        music file cannot be loaded, or it contains no audio, the speech is
        returned unmixed (best-effort: missing music must not fail the
        podcast generation).
    """
    try:
        bg_music = AudioSegment.from_file(bg_music_path, format="mp3")
    except Exception as e:
        # Deliberate best-effort fallback: report and carry on without music.
        print("[ERROR] Failed to load background music:", e)
        return spoken

    # A zero-length track can never fill the target duration; without this
    # guard the looping below would spin forever.
    if len(bg_music) == 0:
        print("[ERROR] Background music track is empty; skipping mix.")
        return spoken

    # Adding a number to an AudioSegment applies that gain in dB.
    bg_music = bg_music + music_gain_db

    intro_ms = max(0, int(intro_ms))
    total_length_ms = len(spoken) + intro_ms

    # Repeat the music until it covers the whole mix. The length is re-checked
    # after every append so millisecond rounding cannot leave the bed short.
    looped_music = AudioSegment.empty()
    while len(looped_music) < total_length_ms:
        looped_music += bg_music

    # Crop to the exact target length; overlay() keeps the base's duration,
    # so the bed must already be as long as intro + speech.
    looped_music = looped_music[:total_length_ms]

    # Speech starts after the music-only intro.
    return looped_music.overlay(spoken, position=intro_ms)
 def main():
+    # Move set_page_config to the top if needed
+    st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
     st.markdown(
         """
         <style>
         "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
         "Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
         "### How to use:\n"
+        "1. **Provide one source:** PDF, URL, YouTube link, or a Topic to Research.\n"
         "2. **Choose the tone and the target duration.**\n"
         "3. **Click 'Generate Podcast'** to produce your podcast.\n\n"
         "**After** the audio is generated, you can **edit** the transcript \n"
         "and **re-generate** the audio with your edits if needed.\n\n"
+        "⏳**Please be patient while your podcast is being generated.** It involves content analysis, script creation, "
         "and high-quality audio synthesis, which may take a few minutes.\n\n"
+        "🔥 **Ready to create your personalized podcast?** Give it a try now!"
     )
     col1, col2 = st.columns(2)
         tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
         length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)
     if "audio_bytes" not in st.session_state:
         st.session_state["audio_bytes"] = None
     if "transcript" not in st.session_state:
     generate_button = st.button("Generate Podcast")
     if generate_button:
         progress_bar = st.progress(0)
         progress_text = st.empty()
         progress_text.write("Alright, let's get started...")
         progress_bar.progress(10)
         time.sleep(1.0)
         if audio_bytes is None:
             st.error(transcript)
             st.session_state["audio_bytes"] = None
             st.session_state["transcript"] = None
         else:
             st.session_state["audio_bytes"] = audio_bytes
             st.session_state["transcript"] = transcript
     if st.session_state["audio_bytes"]:
         st.audio(st.session_state["audio_bytes"], format='audio/mp3')
         st.download_button(
             label="Download Podcast (MP3)",
             data=st.session_state["audio_bytes"],
             mime="audio/mpeg"
         )
         st.markdown("### Generated Transcript (Editable)")
         edited_text = st.text_area(
             "Feel free to tweak lines, fix errors, or reword anything.",
             height=300
         )
         if st.button("Regenerate Audio From Edited Text"):
             regen_bar = st.progress(0)
             regen_text = st.empty()
             regen_bar.progress(60)
             time.sleep(1.0)
             dialogue_items = parse_user_edited_transcript(edited_text)
             new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
                 regen_text.write("All set!")
                 st.success("Regenerated audio below:")
                 st.session_state["audio_bytes"] = new_audio_bytes
                 st.session_state["transcript"] = new_transcript