# app.py
import streamlit as st
import time
import re
import os
import tempfile
import pypdf
from pydub import AudioSegment
import difflib # For computing differences between texts
from utils import (
generate_script,
generate_audio_mp3,
truncate_text,
extract_text_from_url,
transcribe_youtube_video,
research_topic,
mix_with_bg_music,
DialogueItem # so we can construct items
)
from prompts import SYSTEM_PROMPT
def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: str):
"""
Looks for lines like:
**Angela**: Hello
**Dimitris**: Great topic...
We treat 'Angela' as the raw display_speaker, 'Hello' as text.
Then we map 'Angela' -> speaker='Jane' if it matches host_name (case-insensitive),
'Dimitris' -> speaker='John' if it matches guest_name, else default to 'Jane'.
Returns a list of (DialogueItem).
"""
pattern = r"\*\*(.+?)\*\*:\s*(.+)"
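    # Group 1 captures the speaker name between the asterisks; group 2 captures the spoken text.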
matches = re.findall(pattern, edited_text)
items = []
if not matches:
        # No speaker-tagged lines were found; treat the entire text as spoken by the host
raw_name = host_name or "Jane"
text_line = edited_text.strip()
speaker = "Jane"
if raw_name.lower() == guest_name.lower():
speaker = "John"
# build a single item
item = DialogueItem(
speaker=speaker,
display_speaker=raw_name,
text=text_line
)
items.append(item)
return items
    # One or more speaker-tagged lines were found
for (raw_name, text_line) in matches:
# Map to TTS speaker
if raw_name.lower() == host_name.lower():
# host -> female
speaker = "Jane"
elif raw_name.lower() == guest_name.lower():
# guest -> male
speaker = "John"
else:
# unknown -> default to female host
speaker = "Jane"
item = DialogueItem(
speaker=speaker,
display_speaker=raw_name,
text=text_line
)
items.append(item)
return items
def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
"""
Re-generates multi-speaker audio from user-edited DialogueItems,
then mixes with background music (bg_music.mp3) or custom music.
Returns final audio bytes and updated transcript (using display_speaker).
"""
audio_segments = []
transcript = ""
crossfade_duration = 50 # in ms
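    # A short crossfade smooths the joins between consecutive speaker segments.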
for item in dialogue_items:
audio_file = generate_audio_mp3(item.text, item.speaker)
seg = AudioSegment.from_file(audio_file, format="mp3")
audio_segments.append(seg)
# Use item.display_speaker for the text transcript
transcript += f"**{item.display_speaker}**: {item.text}\n\n"
os.remove(audio_file)
if not audio_segments:
return None, "No audio segments were generated."
# Combine spoken segments sequentially
combined_spoken = audio_segments[0]
for seg in audio_segments[1:]:
combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
final_mix.export(temp_audio.name, format="mp3")
final_mp3_path = temp_audio.name
with open(final_mp3_path, "rb") as f:
audio_bytes = f.read()
os.remove(final_mp3_path)
return audio_bytes, transcript
def generate_podcast(
file,
url,
video_url,
research_topic_input,
tone,
length_minutes,
host_name,
host_desc,
guest_name,
guest_desc,
user_specs,
sponsor_content,
custom_bg_music_path
):
"""
Creates a multi-speaker podcast from PDF, URL, YouTube, or a research topic.
    Uses a female voice (Jane) for the host and a male voice (John) for the guest.
    display_speaker is the user-chosen name; speaker is "Jane" or "John".
Returns (audio_bytes, transcript_str).
"""
sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
if sum(sources) > 1:
return None, "Provide only one input (PDF, URL, YouTube, or Research topic)."
if not any(sources):
return None, "Please provide at least one source."
text = ""
if file:
try:
if not file.name.lower().endswith('.pdf'):
return None, "Please upload a PDF file."
reader = pypdf.PdfReader(file)
text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())
except Exception as e:
return None, f"Error reading PDF: {str(e)}"
elif url:
try:
text = extract_text_from_url(url)
if not text:
return None, "Failed to extract text from URL."
except Exception as e:
return None, f"Error extracting text from URL: {str(e)}"
elif video_url:
try:
text = transcribe_youtube_video(video_url)
if not text:
return None, "Failed to transcribe YouTube video."
except Exception as e:
return None, f"Error transcribing YouTube video: {str(e)}"
elif research_topic_input:
try:
text = research_topic(research_topic_input)
if not text:
return None, f"Sorry, no information found on '{research_topic_input}'."
except Exception as e:
return None, f"Error researching topic: {str(e)}"
# Truncate if needed
text = truncate_text(text)
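    # truncate_text keeps the combined input within the token budget noted in the UI (~2,048 tokens).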
# Build extra instructions
extra_instructions = []
if host_name or guest_name:
h = f"Host: {host_name or 'Jane'} - {host_desc or 'a curious host'}."
g = f"Guest: {guest_name or 'John'} - {guest_desc or 'an expert'}."
extra_instructions.append(f"{h}\n{g}")
if user_specs.strip():
extra_instructions.append(f"Additional User Instructions: {user_specs}")
if sponsor_content.strip():
extra_instructions.append(
"Please include a short sponsored advertisement. The sponsor text is as follows:\n"
+ sponsor_content
)
combined_instructions = "\n\n".join(extra_instructions).strip()
full_prompt = SYSTEM_PROMPT
if combined_instructions:
full_prompt += f"\n\n# Additional Instructions\n{combined_instructions}\n"
# Use "generate_script" with host/guest name so it can do the mapping
try:
script = generate_script(
full_prompt,
text,
tone,
f"{length_minutes} Mins",
host_name=host_name or "Jane",
guest_name=guest_name or "John"
)
except Exception as e:
return None, f"Error generating script: {str(e)}"
audio_segments = []
transcript = ""
crossfade_duration = 50 # ms
try:
for item in script.dialogue:
# item.speaker is guaranteed "Jane" or "John"
# item.display_speaker is the user-facing name
audio_file = generate_audio_mp3(item.text, item.speaker)
seg = AudioSegment.from_file(audio_file, format="mp3")
audio_segments.append(seg)
transcript += f"**{item.display_speaker}**: {item.text}\n\n"
os.remove(audio_file)
if not audio_segments:
return None, "No audio segments generated."
combined_spoken = audio_segments[0]
for seg in audio_segments[1:]:
combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)
final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
final_mix.export(temp_audio.name, format="mp3")
final_mp3_path = temp_audio.name
with open(final_mp3_path, "rb") as f:
audio_bytes = f.read()
os.remove(final_mp3_path)
return audio_bytes, transcript
except Exception as e:
return None, f"Error generating audio: {str(e)}"
def highlight_differences(original: str, edited: str) -> str:
"""
    Highlights the differences between the original and edited transcripts.
    Added or modified words are wrapped in <span> tags with red color;
    deleted words are simply omitted from the output.
"""
matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
highlighted = []
for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
if opcode == 'equal':
highlighted.extend(original.split()[i1:i2])
elif opcode in ('replace', 'insert'):
added_words = edited.split()[j1:j2]
highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
elif opcode == 'delete':
pass
return ' '.join(highlighted)
def main():
st.set_page_config(page_title="MyPod - AI-based Podcast Generator", layout="centered")
st.markdown("## MyPod - AI powered Podcast Generator")
st.markdown(
"Welcome to **MyPod**, your go-to AI-powered podcast generator! πŸŽ‰\n\n"
"MyPod transforms your documents, webpages, YouTube videos, or research topics into a more human-sounding, conversational podcast.\n"
"Select a tone and a duration range. The script will be on-topic, concise, and respect your chosen length.\n\n"
"### How to use:\n"
"1. **Provide one source:** PDF Files, Website URL, YouTube link or a Topic to Research.\n"
"2. **Choose the tone and the target duration.**\n"
"3. **Click 'Generate Podcast'** to produce your podcast. After the audio is generated, you can edit the transcript and re-generate the audio with your edits if needed.\n\n"
"**Token Limit:** Up to ~2,048 tokens are supported. Long inputs may be truncated.\n"
"**Note:** YouTube videos will only work if they have captions built in.\n\n"
"⏳**Please be patient while your podcast is being generated.** This process involves content analysis, script creation, "
"and high-quality audio synthesis, which may take a few minutes.\n\n"
"πŸ”₯ **Ready to create your personalized podcast?** Give it a try now and let the magic happen! πŸ”₯"
)
col1, col2 = st.columns(2)
with col1:
file = st.file_uploader("Upload File (.pdf only)", type=["pdf"])
url = st.text_input("Or Enter Website URL")
video_url = st.text_input("Or Enter YouTube Link (Captioned videos)")
with col2:
research_topic_input = st.text_input("Or Research a Topic")
tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
length_minutes = st.slider("Podcast Length (in minutes)", 1, 60, 3)
st.markdown("### Customize Your Podcast (New Features)")
with st.expander("Set Host & Guest Names/Descriptions (Optional)"):
host_name = st.text_input("Host Name (leave blank for 'Jane')")
host_desc = st.text_input("Host Description (Optional)")
guest_name = st.text_input("Guest Name (leave blank for 'John')")
guest_desc = st.text_input("Guest Description (Optional)")
user_specs = st.text_area("Any special instructions or prompts for the script? (Optional)", "")
sponsor_content = st.text_area("Sponsored Content / Ad (Optional)", "")
custom_bg_music_file = st.file_uploader("Upload Custom Background Music (Optional)", type=["mp3", "wav"])
custom_bg_music_path = None
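    # If music was uploaded, persist it to a temp file so mix_with_bg_music can
    # read it from disk; otherwise the default bg_music.mp3 is used.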
if custom_bg_music_file:
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(custom_bg_music_file.name)[1]) as tmp:
tmp.write(custom_bg_music_file.read())
custom_bg_music_path = tmp.name
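    # Streamlit reruns the whole script on every interaction, so the generated
    # audio and transcript are kept in session_state to survive reruns.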
if "audio_bytes" not in st.session_state:
st.session_state["audio_bytes"] = None
if "transcript" not in st.session_state:
st.session_state["transcript"] = None
if "transcript_original" not in st.session_state:
st.session_state["transcript_original"] = None
generate_button = st.button("Generate Podcast")
if generate_button:
progress_bar = st.progress(0)
progress_text = st.empty()
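        # Staged status messages are cosmetic pacing; the actual generation
        # happens in the generate_podcast() call below.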
messages = [
"πŸ” Analyzing your input...",
"πŸ“ Crafting the perfect script...",
"πŸŽ™οΈ Generating high-quality audio...",
"🎢 Adding the finishing touches..."
]
progress_text.write(messages[0])
progress_bar.progress(0)
time.sleep(1.0)
progress_text.write(messages[1])
progress_bar.progress(25)
time.sleep(1.0)
progress_text.write(messages[2])
progress_bar.progress(50)
time.sleep(1.0)
progress_text.write(messages[3])
progress_bar.progress(75)
time.sleep(1.0)
audio_bytes, transcript = generate_podcast(
file,
url,
video_url,
research_topic_input,
tone,
length_minutes,
host_name,
host_desc,
guest_name,
guest_desc,
user_specs,
sponsor_content,
custom_bg_music_path
)
progress_bar.progress(100)
progress_text.write("βœ… Done!")
if audio_bytes is None:
st.error(transcript)
st.session_state["audio_bytes"] = None
st.session_state["transcript"] = None
st.session_state["transcript_original"] = None
else:
st.success("Podcast generated successfully!")
st.session_state["audio_bytes"] = audio_bytes
st.session_state["transcript"] = transcript
st.session_state["transcript_original"] = transcript
if st.session_state["audio_bytes"]:
st.audio(st.session_state["audio_bytes"], format='audio/mp3')
st.download_button(
label="Download Podcast (MP3)",
data=st.session_state["audio_bytes"],
file_name="my_podcast.mp3",
mime="audio/mpeg"
)
st.markdown("### Generated Transcript (Editable)")
edited_text = st.text_area(
"Feel free to tweak lines, fix errors, or reword anything.",
value=st.session_state["transcript"],
height=300
)
if st.session_state["transcript_original"]:
highlighted = highlight_differences(
st.session_state["transcript_original"],
edited_text
)
st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True)
st.markdown(highlighted, unsafe_allow_html=True)
if st.button("Regenerate Audio From Edited Text"):
regen_bar = st.progress(0)
regen_text = st.empty()
regen_text.write("πŸ”„ Regenerating your podcast with the edits...")
regen_bar.progress(25)
time.sleep(1.0)
regen_text.write("πŸ”§ Adjusting the script based on your changes...")
regen_bar.progress(50)
time.sleep(1.0)
# Parse lines, map to DialogueItem with correct TTS speaker
# host => female (Jane), guest => male (John)
dialogue_items = parse_user_edited_transcript(edited_text, host_name or "Jane", guest_name or "John")
new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path)
regen_bar.progress(75)
time.sleep(1.0)
if new_audio_bytes is None:
regen_bar.progress(100)
st.error(new_transcript)
else:
regen_bar.progress(100)
regen_text.write("βœ… Regeneration complete!")
st.success("Regenerated audio below:")
st.session_state["audio_bytes"] = new_audio_bytes
st.session_state["transcript"] = new_transcript
st.session_state["transcript_original"] = new_transcript
st.audio(new_audio_bytes, format='audio/mp3')
st.download_button(
label="Download Edited Podcast (MP3)",
data=new_audio_bytes,
file_name="my_podcast_edited.mp3",
mime="audio/mpeg"
)
st.markdown("### Updated Transcript")
st.markdown(new_transcript)
if __name__ == "__main__":
main()