MyPod_10

Running

App Files Files Community

siddhartharyaai committed on Jan 13

Commit

117149e

verified ·

1 Parent(s): 2752da2

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -41

app.py CHANGED Viewed

@@ -17,32 +17,37 @@ from utils import (
     transcribe_youtube_video,
     research_topic,
     mix_with_bg_music,
-    DialogueItem  # so we can construct items
 )
 from prompts import SYSTEM_PROMPT
 def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: str):
     """
     Looks for lines like:
         **Angela**: Hello
         **Dimitris**: Great topic...
     We treat 'Angela' as the raw display_speaker, 'Hello' as text.
-    Then we map 'Angela' -> speaker='Jane' if it matches host_name (case-insensitive),
-    'Dimitris' -> speaker='John' if it matches guest_name, else default to 'Jane'.
-    Returns a list of (DialogueItem).
     """
     pattern = r"\*\*(.+?)\*\*:\s*(.+)"
     matches = re.findall(pattern, edited_text)
     items = []
     if not matches:
-        # No lines found, treat entire text as if it's host
         raw_name = host_name or "Jane"
         text_line = edited_text.strip()
         speaker = "Jane"
         if raw_name.lower() == guest_name.lower():
             speaker = "John"
-        # build a single item
         item = DialogueItem(
             speaker=speaker,
             display_speaker=raw_name,
@@ -51,17 +56,15 @@ def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: s
         items.append(item)
         return items
-    # If we have multiple lines
     for (raw_name, text_line) in matches:
-        # Map to TTS speaker
         if raw_name.lower() == host_name.lower():
-            # host -> female
             speaker = "Jane"
         elif raw_name.lower() == guest_name.lower():
-            # guest -> male
             speaker = "John"
         else:
-            # unknown -> default to female host
             speaker = "Jane"
         item = DialogueItem(
             speaker=speaker,
@@ -69,36 +72,42 @@ def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: s
             text=text_line
         )
         items.append(item)
     return items
 def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
     """
     Re-generates multi-speaker audio from user-edited DialogueItems,
-    then mixes with background music (bg_music.mp3) or custom music.
-    Returns final audio bytes and updated transcript (using display_speaker).
     """
     audio_segments = []
     transcript = ""
-    crossfade_duration = 50  # in ms
     for item in dialogue_items:
         audio_file = generate_audio_mp3(item.text, item.speaker)
         seg = AudioSegment.from_file(audio_file, format="mp3")
         audio_segments.append(seg)
-        # Use item.display_speaker for the text transcript
         transcript += f"**{item.display_speaker}**: {item.text}\n\n"
         os.remove(audio_file)
     if not audio_segments:
         return None, "No audio segments were generated."
-    # Combine spoken segments sequentially
     combined_spoken = audio_segments[0]
     for seg in audio_segments[1:]:
         combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
     final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
         final_mix.export(temp_audio.name, format="mp3")
         final_mp3_path = temp_audio.name
@@ -109,6 +118,7 @@ def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
     return audio_bytes, transcript
 def generate_podcast(
     file,
     url,
@@ -122,21 +132,24 @@ def generate_podcast(
     guest_desc,
     user_specs,
     sponsor_content,
     custom_bg_music_path
 ):
     """
     Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
-    Uses female voice (Jane) for host, male voice (John) for guest.
-    Display_speaker is user-chosen name, speaker is "Jane" or "John".
     Returns (audio_bytes, transcript_str).
     """
     sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
     if sum(sources) > 1:
-        return None, "Provide only one input (PDF, URL, YouTube, or Research topic)."
     if not any(sources):
         return None, "Please provide at least one source."
     text = ""
     if file:
         try:
@@ -168,32 +181,35 @@ def generate_podcast(
         except Exception as e:
             return None, f"Error researching topic: {str(e)}"
-    # Truncate if needed
     text = truncate_text(text)
-    # Build extra instructions
     extra_instructions = []
     if host_name or guest_name:
-        h = f"Host: {host_name or 'Jane'} - {host_desc or 'a curious host'}."
-        g = f"Guest: {guest_name or 'John'} - {guest_desc or 'an expert'}."
-        extra_instructions.append(f"{h}\n{g}")
     if user_specs.strip():
         extra_instructions.append(f"Additional User Instructions: {user_specs}")
     if sponsor_content.strip():
         extra_instructions.append(
-            "Please include a short sponsored advertisement. The sponsor text is as follows:\n"
-            + sponsor_content
         )
     combined_instructions = "\n\n".join(extra_instructions).strip()
     full_prompt = SYSTEM_PROMPT
     if combined_instructions:
         full_prompt += f"\n\n# Additional Instructions\n{combined_instructions}\n"
-    # Use "generate_script" with host/guest name so it can do the mapping
     try:
         script = generate_script(
             full_prompt,
@@ -201,7 +217,8 @@ def generate_podcast(
             tone,
             f"{length_minutes} Mins",
             host_name=host_name or "Jane",
-            guest_name=guest_name or "John"
         )
     except Exception as e:
         return None, f"Error generating script: {str(e)}"
@@ -211,24 +228,28 @@ def generate_podcast(
     crossfade_duration = 50  # ms
     try:
         for item in script.dialogue:
-            # item.speaker is guaranteed "Jane" or "John"
-            # item.display_speaker is the user-facing name
             audio_file = generate_audio_mp3(item.text, item.speaker)
             seg = AudioSegment.from_file(audio_file, format="mp3")
             audio_segments.append(seg)
             transcript += f"**{item.display_speaker}**: {item.text}\n\n"
             os.remove(audio_file)
         if not audio_segments:
             return None, "No audio segments generated."
         combined_spoken = audio_segments[0]
         for seg in audio_segments[1:]:
             combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
         final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
             final_mix.export(temp_audio.name, format="mp3")
             final_mp3_path = temp_audio.name
@@ -242,10 +263,11 @@ def generate_podcast(
     except Exception as e:
         return None, f"Error generating audio: {str(e)}"
 def highlight_differences(original: str, edited: str) -> str:
     """
     Highlights the differences between the original and edited transcripts.
-    Added or modified words are wrapped in <span> tags with red color.
     """
     matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
     highlighted = []
@@ -256,14 +278,19 @@ def highlight_differences(original: str, edited: str) -> str:
             added_words = edited.split()[j1:j2]
             highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
         elif opcode == 'delete':
             pass
     return ' '.join(highlighted)
 def main():
     st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
     st.markdown("## MyPod - AI powered Podcast Generator")
     st.markdown(
         "Welcome to **MyPod**, your go-to AI-powered podcast generator! 🎉\n\n"
         "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
@@ -272,6 +299,7 @@ def main():
         "1. **Provide one source:** PDF Files, Website URL, YouTube link or a Topic to Research.\n"
         "2. **Choose the tone and the target duration.**\n"
         "3. **Click 'Generate Podcast'** to produce your podcast. After the audio is generated, you can edit the transcript and re-generate the audio with your edits if needed.\n\n"
         "**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
         "**Note:** YouTube videos will only work if they have captions built in.\n\n"
         "⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
@@ -279,6 +307,7 @@ def main():
         "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
     )
     col1, col2 = st.columns(2)
     with col1:
         file = st.file_uploader("Upload File (.pdf only)", type=["pdf"])
@@ -289,16 +318,30 @@ def main():
         tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
         length_minutes = st.slider("Podcast Length (in minutes)", 1, 60, 3)
     st.markdown("### Customize Your Podcast (New Features)")
     with st.expander("Set Host & Guest Names/Descriptions (Optional)"):
         host_name = st.text_input("Host Name (leave blank for 'Jane')")
         host_desc = st.text_input("Host Description (Optional)")
         guest_name = st.text_input("Guest Name (leave blank for 'John')")
         guest_desc = st.text_input("Guest Description (Optional)")
     user_specs = st.text_area("Any special instructions or prompts for the script? (Optional)", "")
     sponsor_content = st.text_area("Sponsored Content / Ad (Optional)", "")
     custom_bg_music_file = st.file_uploader("Upload Custom Background Music (Optional)", type=["mp3", "wav"])
     custom_bg_music_path = None
     if custom_bg_music_file:
@@ -306,6 +349,7 @@ def main():
             tmp.write(custom_bg_music_file.read())
             custom_bg_music_path = tmp.name
     if "audio_bytes" not in st.session_state:
         st.session_state["audio_bytes"] = None
     if "transcript" not in st.session_state:
@@ -313,32 +357,34 @@ def main():
     if "transcript_original" not in st.session_state:
         st.session_state["transcript_original"] = None
     generate_button = st.button("Generate Podcast")
     if generate_button:
         progress_bar = st.progress(0)
         progress_text = st.empty()
-        messages = [
             "🔍 Analyzing your input...",
             "📝 Crafting the perfect script...",
             "🎙️ Generating high-quality audio...",
             "🎶 Adding the finishing touches..."
         ]
-        progress_text.write(messages[0])
         progress_bar.progress(0)
         time.sleep(1.0)
-        progress_text.write(messages[1])
         progress_bar.progress(25)
         time.sleep(1.0)
-        progress_text.write(messages[2])
         progress_bar.progress(50)
         time.sleep(1.0)
-        progress_text.write(messages[3])
         progress_bar.progress(75)
         time.sleep(1.0)
@@ -355,12 +401,14 @@ def main():
             guest_desc,
             user_specs,
             sponsor_content,
             custom_bg_music_path
         )
         progress_bar.progress(100)
         progress_text.write("✅ Done!")
         if audio_bytes is None:
             st.error(transcript)
             st.session_state["audio_bytes"] = None
@@ -372,6 +420,7 @@ def main():
             st.session_state["transcript"] = transcript
             st.session_state["transcript_original"] = transcript
     if st.session_state["audio_bytes"]:
         st.audio(st.session_state["audio_bytes"], format='audio/mp3')
         st.download_button(
@@ -388,13 +437,15 @@ def main():
             height=300
         )
         if st.session_state["transcript_original"]:
-            highlighted = highlight_differences(
                 st.session_state["transcript_original"],
                 edited_text
             )
             st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True)
-            st.markdown(highlighted, unsafe_allow_html=True)
         if st.button("Regenerate Audio From Edited Text"):
             regen_bar = st.progress(0)
@@ -408,9 +459,12 @@ def main():
             regen_bar.progress(50)
             time.sleep(1.0)
-            # Parse lines, map to DialogueItem with correct TTS speaker
-            # host => female (Jane), guest => male (John)
-            dialogue_items = parse_user_edited_transcript(edited_text, host_name or "Jane", guest_name or "John")
             new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path)
             regen_bar.progress(75)
@@ -438,5 +492,6 @@ def main():
                 st.markdown("### Updated Transcript")
                 st.markdown(new_transcript)
 if __name__ == "__main__":
     main()

     transcribe_youtube_video,
     research_topic,
     mix_with_bg_music,
+    DialogueItem
 )
 from prompts import SYSTEM_PROMPT
 def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: str):
     """
     Looks for lines like:
         **Angela**: Hello
         **Dimitris**: Great topic...
     We treat 'Angela' as the raw display_speaker, 'Hello' as text.
+    Then we map 'Angela' -> speaker='Jane' (if it matches host_name),
+    'Dimitris' -> speaker='John' (if it matches guest_name), etc.
+    Returns a list of DialogueItem.
     """
+    # Regex pattern to match lines in the format:
+    # **SpeakerName**: Some text here
     pattern = r"\*\*(.+?)\*\*:\s*(.+)"
     matches = re.findall(pattern, edited_text)
     items = []
+    # If no matches found, treat entire text as if it's from the host
     if not matches:
         raw_name = host_name or "Jane"
         text_line = edited_text.strip()
         speaker = "Jane"
+        # If user typed the entire text under the guest's name, switch
         if raw_name.lower() == guest_name.lower():
             speaker = "John"
         item = DialogueItem(
             speaker=speaker,
             display_speaker=raw_name,
         items.append(item)
         return items
+    # Otherwise, we found multiple lines
     for (raw_name, text_line) in matches:
+        # Decide TTS speaker based on the name
         if raw_name.lower() == host_name.lower():
             speaker = "Jane"
         elif raw_name.lower() == guest_name.lower():
             speaker = "John"
         else:
+            # Default to "Jane" if we can't match
             speaker = "Jane"
         item = DialogueItem(
             speaker=speaker,
             text=text_line
         )
         items.append(item)
     return items
 def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
     """
     Re-generates multi-speaker audio from user-edited DialogueItems,
+    then mixes with background music or custom music.
+    Returns (audio_bytes, transcript_str).
     """
     audio_segments = []
     transcript = ""
+    crossfade_duration = 50  # ms
     for item in dialogue_items:
+        # Generate TTS for each line
         audio_file = generate_audio_mp3(item.text, item.speaker)
         seg = AudioSegment.from_file(audio_file, format="mp3")
         audio_segments.append(seg)
+        # Build the updated transcript with the user-facing display_speaker
         transcript += f"**{item.display_speaker}**: {item.text}\n\n"
         os.remove(audio_file)
     if not audio_segments:
         return None, "No audio segments were generated."
+    # Sequentially combine all the speaker segments
     combined_spoken = audio_segments[0]
     for seg in audio_segments[1:]:
         combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
+    # Mix final spoken track with background music
     final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
+    # Export to MP3 bytes
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
         final_mix.export(temp_audio.name, format="mp3")
         final_mp3_path = temp_audio.name
     return audio_bytes, transcript
 def generate_podcast(
     file,
     url,
     guest_desc,
     user_specs,
     sponsor_content,
+    sponsor_style,            # NEW: "Separate Break" or "Blended"
     custom_bg_music_path
 ):
     """
     Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
+    Ensures female voice for host (Jane), male voice for guest (John).
+    Sponsor content is either separate or blended based on sponsor_style.
     Returns (audio_bytes, transcript_str).
     """
+    # Validate only one input source
     sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
     if sum(sources) > 1:
+        return None, "Provide only one input (PDF, URL, YouTube, or Topic)."
     if not any(sources):
         return None, "Please provide at least one source."
+    # Load or extract text
     text = ""
     if file:
         try:
         except Exception as e:
             return None, f"Error researching topic: {str(e)}"
+    # Truncate text if too long
     text = truncate_text(text)
+    # Build extra prompt instructions
     extra_instructions = []
+    # Host/Guest descriptions
     if host_name or guest_name:
+        host_line = f"Host: {host_name or 'Jane'} - {host_desc or 'a curious host'}."
+        guest_line = f"Guest: {guest_name or 'John'} - {guest_desc or 'an expert'}."
+        extra_instructions.append(f"{host_line}\n{guest_line}")
+    # User custom specs
     if user_specs.strip():
         extra_instructions.append(f"Additional User Instructions: {user_specs}")
+    # Sponsor content
     if sponsor_content.strip():
         extra_instructions.append(
+            f"Sponsor Content Provided (under ~30 seconds):\n{sponsor_content}"
         )
+    # Combine all instructions
     combined_instructions = "\n\n".join(extra_instructions).strip()
     full_prompt = SYSTEM_PROMPT
     if combined_instructions:
         full_prompt += f"\n\n# Additional Instructions\n{combined_instructions}\n"
+    # Generate the script
     try:
         script = generate_script(
             full_prompt,
             tone,
             f"{length_minutes} Mins",
             host_name=host_name or "Jane",
+            guest_name=guest_name or "John",
+            sponsor_style=sponsor_style  # pass style to the LLM
         )
     except Exception as e:
         return None, f"Error generating script: {str(e)}"
     crossfade_duration = 50  # ms
     try:
+        # For each dialogue item, do TTS
         for item in script.dialogue:
             audio_file = generate_audio_mp3(item.text, item.speaker)
             seg = AudioSegment.from_file(audio_file, format="mp3")
             audio_segments.append(seg)
+            # Build transcript with display_speaker
             transcript += f"**{item.display_speaker}**: {item.text}\n\n"
             os.remove(audio_file)
         if not audio_segments:
             return None, "No audio segments generated."
+        # Combine speaker segments
         combined_spoken = audio_segments[0]
         for seg in audio_segments[1:]:
             combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
+        # Mix with background music
         final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
+        # Export final to MP3 bytes
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
             final_mix.export(temp_audio.name, format="mp3")
             final_mp3_path = temp_audio.name
     except Exception as e:
         return None, f"Error generating audio: {str(e)}"
 def highlight_differences(original: str, edited: str) -> str:
     """
     Highlights the differences between the original and edited transcripts.
+    Added or replaced words are wrapped in <span style="color:red">...</span>.
     """
     matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
     highlighted = []
             added_words = edited.split()[j1:j2]
             highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
         elif opcode == 'delete':
+            # We ignore deletions
             pass
     return ' '.join(highlighted)
 def main():
+    # Set page config
     st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
+    # Main header
     st.markdown("## MyPod - AI powered Podcast Generator")
+    # Original Intro Explanation
     st.markdown(
         "Welcome to **MyPod**, your go-to AI-powered podcast generator! 🎉\n\n"
         "MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
         "1. **Provide one source:** PDF Files, Website URL, YouTube link or a Topic to Research.\n"
         "2. **Choose the tone and the target duration.**\n"
         "3. **Click 'Generate Podcast'** to produce your podcast. After the audio is generated, you can edit the transcript and re-generate the audio with your edits if needed.\n\n"
+        "**Research a Topic:** If it's too niche or specific, you might not get the desired outcome.\n\n"
         "**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
         "**Note:** YouTube videos will only work if they have captions built in.\n\n"
         "⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
         "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
     )
+    # Two columns for inputs
     col1, col2 = st.columns(2)
     with col1:
         file = st.file_uploader("Upload File (.pdf only)", type=["pdf"])
         tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
         length_minutes = st.slider("Podcast Length (in minutes)", 1, 60, 3)
+    # Additional user customizations
     st.markdown("### Customize Your Podcast (New Features)")
+    # Host & Guest
     with st.expander("Set Host & Guest Names/Descriptions (Optional)"):
         host_name = st.text_input("Host Name (leave blank for 'Jane')")
         host_desc = st.text_input("Host Description (Optional)")
         guest_name = st.text_input("Guest Name (leave blank for 'John')")
         guest_desc = st.text_input("Guest Description (Optional)")
+    # Additional specs
     user_specs = st.text_area("Any special instructions or prompts for the script? (Optional)", "")
+    # Sponsor content
     sponsor_content = st.text_area("Sponsored Content / Ad (Optional)", "")
+    # Sponsor integration style
+    sponsor_style = st.selectbox(
+        "Sponsor Integration Style",
+        ["Separate Break", "Blended"],
+        help="Choose whether to insert sponsor content as a separate ad break or blend it into the conversation."
+    )
+    # Custom background music
     custom_bg_music_file = st.file_uploader("Upload Custom Background Music (Optional)", type=["mp3", "wav"])
     custom_bg_music_path = None
     if custom_bg_music_file:
             tmp.write(custom_bg_music_file.read())
             custom_bg_music_path = tmp.name
+    # Initialize session state if needed
     if "audio_bytes" not in st.session_state:
         st.session_state["audio_bytes"] = None
     if "transcript" not in st.session_state:
     if "transcript_original" not in st.session_state:
         st.session_state["transcript_original"] = None
+    # Generate button
     generate_button = st.button("Generate Podcast")
     if generate_button:
         progress_bar = st.progress(0)
         progress_text = st.empty()
+        progress_messages = [
             "🔍 Analyzing your input...",
             "📝 Crafting the perfect script...",
             "🎙️ Generating high-quality audio...",
             "🎶 Adding the finishing touches..."
         ]
+        # Simulate progress stages
+        progress_text.write(progress_messages[0])
         progress_bar.progress(0)
         time.sleep(1.0)
+        progress_text.write(progress_messages[1])
         progress_bar.progress(25)
         time.sleep(1.0)
+        progress_text.write(progress_messages[2])
         progress_bar.progress(50)
         time.sleep(1.0)
+        progress_text.write(progress_messages[3])
         progress_bar.progress(75)
         time.sleep(1.0)
             guest_desc,
             user_specs,
             sponsor_content,
+            sponsor_style,
             custom_bg_music_path
         )
         progress_bar.progress(100)
         progress_text.write("✅ Done!")
+        # Check results
         if audio_bytes is None:
             st.error(transcript)
             st.session_state["audio_bytes"] = None
             st.session_state["transcript"] = transcript
             st.session_state["transcript_original"] = transcript
+    # If we have audio bytes, display the player and transcript
     if st.session_state["audio_bytes"]:
         st.audio(st.session_state["audio_bytes"], format='audio/mp3')
         st.download_button(
             height=300
         )
+        # Show differences from the original
         if st.session_state["transcript_original"]:
+            highlighted_transcript = highlight_differences(
                 st.session_state["transcript_original"],
                 edited_text
             )
             st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True)
+            st.markdown(highlighted_transcript, unsafe_allow_html=True)
         if st.button("Regenerate Audio From Edited Text"):
             regen_bar = st.progress(0)
             regen_bar.progress(50)
             time.sleep(1.0)
+            # Parse the user-edited transcript
+            dialogue_items = parse_user_edited_transcript(
+                edited_text,
+                host_name or "Jane",
+                guest_name or "John"
+            )
             new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path)
             regen_bar.progress(75)
                 st.markdown("### Updated Transcript")
                 st.markdown(new_transcript)
 if __name__ == "__main__":
     main()