# app.py
import streamlit as st
import time
import re
import os
import tempfile
import pypdf
from pydub import AudioSegment
from utils import (
    generate_script,
    generate_audio_mp3,
    truncate_text,
    extract_text_from_url,
    transcribe_youtube_video,
    research_topic
)
from prompts import SYSTEM_PROMPT

def parse_user_edited_transcript(edited_text: str):
    """
    Looks for lines like:
        **Jane**: Hello
        **John**: Sure, I'd love to talk about that.
    Returns a list of (speaker, text) tuples.
    """
    pattern = r"\*\*(Jane|John)\*\*:\s*(.+)"
    matches = re.findall(pattern, edited_text)
    if not matches:
        # If the user changed the format drastically, treat the entire text as Jane's line
        return [("Jane", edited_text)]
    return matches
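
# Illustrative usage sketch (not called by the app): shows the transcript format
# parse_user_edited_transcript() expects and the (speaker, text) tuples it returns.
# The sample dialogue below is made up for demonstration.
def _example_parse_transcript():
    sample = (
        "**Jane**: Welcome back to the show!\n\n"
        "**John**: Thanks, Jane. Today we're talking about tide pools.\n\n"
    )
    for speaker, text in parse_user_edited_transcript(sample):
        print(f"{speaker}: {text}")
    # Prints:
    #   Jane: Welcome back to the show!
    #   John: Thanks, Jane. Today we're talking about tide pools.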

def regenerate_audio_from_dialogue(dialogue_items):
    """
    Re-generates multi-speaker audio from user-edited text.
    Returns final audio bytes and the updated transcript.
    """
    audio_segments = []
    transcript = ""
    crossfade_duration = 50  # in ms

    for speaker, line_text in dialogue_items:
        audio_file = generate_audio_mp3(line_text, speaker)
        seg = AudioSegment.from_file(audio_file, format="mp3")
        audio_segments.append(seg)
        transcript += f"**{speaker}**: {line_text}\n\n"
        os.remove(audio_file)

    if not audio_segments:
        return None, "No audio segments were generated."

    # Combine the segments with a short crossfade between speakers
    combined = audio_segments[0]
    for seg in audio_segments[1:]:
        combined = combined.append(seg, crossfade=crossfade_duration)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        combined.export(temp_audio.name, format="mp3")
        final_mp3_path = temp_audio.name

    # Read the bytes back so we have a real .mp3 payload to play and download
    with open(final_mp3_path, "rb") as f:
        audio_bytes = f.read()
    os.remove(final_mp3_path)

    return audio_bytes, transcript
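
# Minimal sketch of the pydub crossfade pattern used above, run on synthetic
# silence so it needs no TTS calls. The durations and the 8 kHz frame rate are
# arbitrary example values; AudioSegment is already imported at the top of this file.
def _example_crossfade_join():
    parts = [AudioSegment.silent(duration=500, frame_rate=8000) for _ in range(3)]
    joined = parts[0]
    for part in parts[1:]:
        # Each 50 ms crossfade overlaps the tail of `joined` with the head of `part`
        joined = joined.append(part, crossfade=50)
    # 3 x 500 ms minus two 50 ms overlaps -> about 1400 ms total
    print(len(joined), "ms")
    return joined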

def generate_podcast(file, url, video_url, research_topic_input, tone, length):
    """
    Creates a multi-speaker podcast from:
      - a PDF,
      - a URL,
      - a YouTube video,
      - or a research topic input.
    Returns (audio_bytes, transcript_str).
    """
    sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
    if sum(sources) > 1:
        return None, "Provide only one input (PDF, URL, YouTube, or Research topic)."
    if not any(sources):
        return None, "Please provide at least one source."

    text = ""
    if file:
        # Handle PDF
        try:
            if not file.name.lower().endswith('.pdf'):
                return None, "Please upload a PDF file."
            # st.file_uploader returns a file-like object, so pass it to PdfReader
            # directly; file.name is only the original filename, not a path on disk.
            reader = pypdf.PdfReader(file)
            text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())
        except Exception as e:
            return None, f"Error reading PDF: {str(e)}"
    elif url:
        # Handle URL
        try:
            text = extract_text_from_url(url)
            if not text:
                return None, "Failed to extract text from URL."
        except Exception as e:
            return None, f"Error extracting text from URL: {str(e)}"
    elif video_url:
        # Handle YouTube
        try:
            text = transcribe_youtube_video(video_url)
            if not text:
                return None, "Failed to transcribe YouTube video."
        except Exception as e:
            return None, f"Error transcribing YouTube video: {str(e)}"
    elif research_topic_input:
        # Handle research topic
        try:
            text = research_topic(research_topic_input)
            if not text:
                return None, f"Sorry, no information found on '{research_topic_input}'."
        except Exception as e:
            return None, f"Error researching topic: {str(e)}"
    # Generate the multi-speaker script
    try:
        text = truncate_text(text)
        script = generate_script(SYSTEM_PROMPT, text, tone, length)
    except Exception as e:
        return None, f"Error generating script: {str(e)}"

    audio_segments = []
    transcript = ""
    crossfade_duration = 50  # ms

    try:
        for item in script.dialogue:
            audio_file = generate_audio_mp3(item.text, item.speaker)
            seg = AudioSegment.from_file(audio_file, format="mp3")
            audio_segments.append(seg)
            transcript += f"**{item.speaker}**: {item.text}\n\n"
            os.remove(audio_file)

        if not audio_segments:
            return None, "No audio segments generated."

        combined = audio_segments[0]
        for seg in audio_segments[1:]:
            combined = combined.append(seg, crossfade=crossfade_duration)

        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            combined.export(temp_audio.name, format="mp3")
            final_mp3_path = temp_audio.name

        # Convert the final mp3 to bytes for playback and download
        with open(final_mp3_path, "rb") as f:
            audio_bytes = f.read()
        os.remove(final_mp3_path)

        return audio_bytes, transcript
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"
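
# Illustrative, non-UI call of generate_podcast() (not executed by the app).
# It assumes the backends in utils and any required API keys are configured;
# the topic string and output filename below are made-up examples.
def _example_generate_from_topic():
    audio_bytes, transcript = generate_podcast(
        file=None,
        url=None,
        video_url=None,
        research_topic_input="history of community radio",
        tone="Casual",
        length="1-3 Mins",
    )
    if audio_bytes is None:
        # On failure, the second return value carries the error message
        print("Generation failed:", transcript)
    else:
        with open("example_podcast.mp3", "wb") as f:
            f.write(audio_bytes)
        print(transcript)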

def main():
    # st.set_page_config must be the very first Streamlit command in the script
    st.set_page_config(
        page_title="MyPod - AI-based Podcast Generator",
        layout="centered"
    )

    # Enable "light or dark" theme via custom CSS
    st.markdown(
        """
        <style>
        :root {
            color-scheme: light dark;
        }
        body {
            background-color: #f0f2f6;
            color: #222;
        }
        .css-18e3th9 {
            background-color: #e8eaf2;
        }
        .stButton>button {
            background-color: #0066cc;
            color: white;
            border-radius: 8px;
        }
        .stProgress>div>div>div>div {
            background-color: #0066cc;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    st.title("🎙 MyPod - AI-based Podcast Generator")
    st.markdown(
        "Welcome to **MyPod**, your go-to AI-powered podcast generator! 🎉\n\n"
        "MyPod transforms your documents, webpages, YouTube videos, or research topics into a natural-sounding, conversational podcast.\n"
        "Select a tone and a duration range; the script will stay on-topic, be concise, and respect your chosen length.\n\n"
        "### How to use:\n"
        "1. **Provide one source:** PDF, URL, YouTube link (Requires User Auth - Work in Progress), or a topic to research.\n"
        "2. **Choose the tone and the target duration.**\n"
        "3. **Click 'Generate Podcast'** to produce your podcast.\n\n"
        "**After** the audio is generated, you can **edit** the transcript "
        "and **re-generate** the audio with your edits if needed.\n\n"
        "**Research a Topic:** Please be as detailed as possible in your topic statement. If it's too niche or specific, "
        "you might not get the desired outcome. We'll fetch information from Wikipedia and RSS feeds (BBC, CNN, Associated Press, "
        "NDTV, Times of India, The Hindu, Economic Times, Google News) or the LLM's knowledge base to get recent information about the topic.\n\n"
        "**Token Limit:** Up to ~2,048 tokens are supported; longer inputs may be truncated.\n"
        "**Note:** YouTube transcription uses Whisper on CPU and may take longer for very long videos.\n\n"
        "⏳ **Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
        "and high-quality audio synthesis, which may take a few minutes.\n\n"
        "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
    )
    col1, col2 = st.columns(2)
    with col1:
        file = st.file_uploader("Upload PDF (.pdf only)", type=["pdf"])
        url = st.text_input("Or Enter URL")
        video_url = st.text_input("Or Enter YouTube Link")
    with col2:
        research_topic_input = st.text_input("Or Research a Topic")
        tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
        length = st.radio("Length", ["1-3 Mins", "3-5 Mins", "5-10 Mins", "10-20 Mins"], index=0)

    # Use session_state so results are not lost when the app reruns
    if "audio_bytes" not in st.session_state:
        st.session_state["audio_bytes"] = None
    if "transcript" not in st.session_state:
        st.session_state["transcript"] = None

    generate_button = st.button("Generate Podcast")
    if generate_button:
        # Show a pseudo progress bar for user engagement
        progress_bar = st.progress(0)
        progress_text = st.empty()

        # Staged messages to suggest progress while the work runs
        progress_text.write("Alright, let's get started...")
        progress_bar.progress(10)
        time.sleep(1.0)

        progress_text.write("Working on the magic. Hang tight!")
        progress_bar.progress(40)
        time.sleep(1.0)

        progress_text.write("Almost done. Adding a dash of awesomeness...")
        progress_bar.progress(70)
        time.sleep(1.0)

        audio_bytes, transcript = generate_podcast(
            file, url, video_url, research_topic_input, tone, length
        )
        time.sleep(1.0)
        progress_bar.progress(100)
        progress_text.write("Done!")

        if audio_bytes is None:
            # On failure, `transcript` carries the error message
            st.error(transcript)
            # Clear session state
            st.session_state["audio_bytes"] = None
            st.session_state["transcript"] = None
        else:
            st.success("Podcast generated successfully!")
            st.session_state["audio_bytes"] = audio_bytes
            st.session_state["transcript"] = transcript
    # Check if we have a stored result
    if st.session_state["audio_bytes"]:
        # Show the audio player
        st.audio(st.session_state["audio_bytes"], format='audio/mp3')

        # Provide a download button with a .mp3 extension
        st.download_button(
            label="Download Podcast (MP3)",
            data=st.session_state["audio_bytes"],
            file_name="my_podcast.mp3",
            mime="audio/mpeg"
        )

        # Show the transcript in a text area for editing
        st.markdown("### Generated Transcript (Editable)")
        edited_text = st.text_area(
            "Feel free to tweak lines, fix errors, or reword anything.",
            value=st.session_state["transcript"],
            height=300
        )

        # Regenerate button
        if st.button("Regenerate Audio From Edited Text"):
            regen_bar = st.progress(0)
            regen_text = st.empty()

            regen_text.write("Let's do this revision!")
            regen_bar.progress(25)
            time.sleep(1.0)

            regen_text.write("Cooking up fresh audio...")
            regen_bar.progress(60)
            time.sleep(1.0)

            # Parse the edited transcript and regenerate the audio
            dialogue_items = parse_user_edited_transcript(edited_text)
            new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items)
            regen_bar.progress(90)
            time.sleep(1.0)

            if new_audio_bytes is None:
                regen_bar.progress(100)
                st.error(new_transcript)
            else:
                regen_bar.progress(100)
                regen_text.write("All set!")
                st.success("Regenerated audio below:")

                # Store the updated result
                st.session_state["audio_bytes"] = new_audio_bytes
                st.session_state["transcript"] = new_transcript

                st.audio(new_audio_bytes, format='audio/mp3')
                st.download_button(
                    label="Download Edited Podcast (MP3)",
                    data=new_audio_bytes,
                    file_name="my_podcast_edited.mp3",
                    mime="audio/mpeg"
                )
                st.markdown(new_transcript)

if __name__ == "__main__":
    main()