# app.py

import streamlit as st
import time
import re
import os
import tempfile
import pypdf
from pydub import AudioSegment
import difflib

from utils import (
    generate_script,
    generate_audio_mp3,
    truncate_text,
    extract_text_from_url,
    transcribe_youtube_video,
    research_topic,
    mix_with_bg_music,
    DialogueItem
)
from prompts import SYSTEM_PROMPT

# Q&A with microphone support
from qa import AudioBufferProcessor, handle_qa_exchange, transcribe_audio_deepgram
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration

MAX_QA_QUESTIONS = 5
def parse_user_edited_transcript(edited_text: str, host_name: str, guest_name: str):
    """Parse a transcript in '**Speaker**: line' format back into DialogueItems."""
    pattern = r"\*\*(.+?)\*\*:\s*(.+)"
    matches = re.findall(pattern, edited_text)

    items = []
    if not matches:
        # No '**Name**:' markers found; treat the whole text as one host line.
        raw_name = host_name or "Jane"
        text_line = edited_text.strip()
        speaker = "Jane"
        if raw_name.lower() == guest_name.lower():
            speaker = "John"
        item = DialogueItem(
            speaker=speaker,
            display_speaker=raw_name,
            text=text_line
        )
        items.append(item)
        return items

    for (raw_name, text_line) in matches:
        if raw_name.lower() == host_name.lower():
            speaker = "Jane"
        elif raw_name.lower() == guest_name.lower():
            speaker = "John"
        else:
            # Unrecognized speaker names fall back to the host voice.
            speaker = "Jane"
        item = DialogueItem(
            speaker=speaker,
            display_speaker=raw_name,
            text=text_line
        )
        items.append(item)
    return items
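
# Illustrative example (hypothetical input, assuming the DialogueItem fields
# used above): an edited transcript such as
#
#   **Jane**: Welcome back to the show!
#   **John**: Thanks for having me.
#
# parses into two DialogueItems, where speaker ("Jane"/"John") selects the TTS
# voice and display_speaker keeps the name exactly as the user typed it.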
def regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path=None):
    """Re-synthesize audio for edited dialogue items; return (mp3_bytes, transcript)."""
    audio_segments = []
    transcript = ""
    crossfade_duration = 50  # milliseconds of overlap between consecutive lines

    for item in dialogue_items:
        audio_file = generate_audio_mp3(item.text, item.speaker)
        seg = AudioSegment.from_file(audio_file, format="mp3")
        audio_segments.append(seg)
        transcript += f"**{item.display_speaker}**: {item.text}\n\n"
        os.remove(audio_file)

    if not audio_segments:
        return None, "No audio segments were generated."

    combined_spoken = audio_segments[0]
    for seg in audio_segments[1:]:
        combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)

    final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        final_mix.export(temp_audio.name, format="mp3")
        final_mp3_path = temp_audio.name

    with open(final_mp3_path, "rb") as f:
        audio_bytes = f.read()
    os.remove(final_mp3_path)

    return audio_bytes, transcript
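
# A minimal standalone sketch of the pydub stitching used above (file names
# are placeholders): append(..., crossfade=50) overlaps the last 50 ms of one
# clip with the first 50 ms of the next, avoiding audible hard cuts.
#
#   from pydub import AudioSegment
#   a = AudioSegment.from_file("line1.mp3", format="mp3")
#   b = AudioSegment.from_file("line2.mp3", format="mp3")
#   a.append(b, crossfade=50).export("joined.mp3", format="mp3")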
def generate_podcast(
    file,
    url,
    video_url,
    research_topic_input,
    tone,
    length_minutes,
    host_name,
    host_desc,
    guest_name,
    guest_desc,
    user_specs,
    sponsor_content,
    sponsor_style,
    custom_bg_music_path
):
    """Build a podcast from exactly one source; return (mp3_bytes, transcript)."""
    sources = [bool(file), bool(url), bool(video_url), bool(research_topic_input)]
    if sum(sources) > 1:
        return None, "Provide only one input (PDF, URL, YouTube, or Topic)."
    if not any(sources):
        return None, "Please provide at least one source."

    text = ""
    if file:
        try:
            if not file.name.lower().endswith(".pdf"):
                return None, "Please upload a PDF file."
            reader = pypdf.PdfReader(file)
            text = " ".join(page.extract_text() for page in reader.pages if page.extract_text())
        except Exception as e:
            return None, f"Error reading PDF: {str(e)}"
    elif url:
        try:
            text = extract_text_from_url(url)
            if not text:
                return None, "Failed to extract text from URL."
        except Exception as e:
            return None, f"Error extracting text from URL: {str(e)}"
    elif video_url:
        try:
            text = transcribe_youtube_video(video_url)
            if not text:
                return None, "Failed to transcribe YouTube video."
        except Exception as e:
            return None, f"Error transcribing YouTube video: {str(e)}"
    elif research_topic_input:
        try:
            text = research_topic(research_topic_input)
            if not text:
                return None, f"Sorry, no information found on '{research_topic_input}'."
        except Exception as e:
            return None, f"Error researching topic: {str(e)}"

    # Keep the source text within the model's context window.
    text = truncate_text(text)

    extra_instructions = []
    if host_name or guest_name:
        host_line = f"Host: {host_name or 'Jane'} - {host_desc or 'a curious host'}."
        guest_line = f"Guest: {guest_name or 'John'} - {guest_desc or 'an expert'}."
        extra_instructions.append(f"{host_line}\n{guest_line}")
    if user_specs.strip():
        extra_instructions.append(f"Additional User Instructions: {user_specs}")
    if sponsor_content.strip():
        extra_instructions.append(
            f"Sponsor Content Provided (should be under ~30 seconds):\n{sponsor_content}"
        )

    combined_instructions = "\n\n".join(extra_instructions).strip()
    full_prompt = SYSTEM_PROMPT
    if combined_instructions:
        full_prompt += f"\n\n# Additional Instructions\n{combined_instructions}\n"

    try:
        script = generate_script(
            full_prompt,
            text,
            tone,
            f"{length_minutes} Mins",
            host_name=host_name or "Jane",
            guest_name=guest_name or "John",
            sponsor_style=sponsor_style
        )
    except Exception as e:
        return None, f"Error generating script: {str(e)}"

    audio_segments = []
    transcript = ""
    crossfade_duration = 50  # milliseconds

    try:
        for item in script.dialogue:
            audio_file = generate_audio_mp3(item.text, item.speaker)
            seg = AudioSegment.from_file(audio_file, format="mp3")
            audio_segments.append(seg)
            transcript += f"**{item.display_speaker}**: {item.text}\n\n"
            os.remove(audio_file)

        if not audio_segments:
            return None, "No audio segments generated."

        combined_spoken = audio_segments[0]
        for seg in audio_segments[1:]:
            combined_spoken = combined_spoken.append(seg, crossfade=crossfade_duration)

        final_mix = mix_with_bg_music(combined_spoken, custom_bg_music_path)

        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            final_mix.export(temp_audio.name, format="mp3")
            final_mp3_path = temp_audio.name

        with open(final_mp3_path, "rb") as f:
            audio_bytes = f.read()
        os.remove(final_mp3_path)

        return audio_bytes, transcript
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"
def highlight_differences(original: str, edited: str) -> str:
    """Return the edited text with added/replaced words wrapped in red <span> tags."""
    matcher = difflib.SequenceMatcher(None, original.split(), edited.split())
    highlighted = []
    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        if opcode == 'equal':
            highlighted.extend(original.split()[i1:i2])
        elif opcode in ('replace', 'insert'):
            added_words = edited.split()[j1:j2]
            highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words])
        elif opcode == 'delete':
            # Words removed by the user are simply omitted from the highlight.
            pass
    return ' '.join(highlighted)
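
# Illustrative example: highlight_differences("the quick fox", "the slow fox")
# returns 'the <span style="color:red">slow</span> fox'. Added or replaced
# words are wrapped in red <span> tags (rendered via st.markdown with
# unsafe_allow_html=True); deleted words are dropped from the highlighted view.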
def main():
    st.set_page_config(
        page_title="MyPod - AI-based Podcast Generator",
        layout="centered"
    )

    logo_col, title_col = st.columns([1, 10])
    with logo_col:
        st.image("logomypod.jpg", width=60)
    with title_col:
        st.markdown("## MyPod - AI-powered Podcast Generator")

    st.markdown(
        "Welcome to **MyPod**, your go-to AI-powered podcast generator! 🎉\n\n"
        "MyPod transforms your documents, webpages, YouTube videos, or research topics into a "
        "human-sounding, conversational podcast.\n"
        "Select a tone and a target duration. The script will stay on-topic, concise, and within your chosen length.\n\n"
        "### How to use:\n"
        "1. **Provide one source:** a PDF file, website URL, YouTube video, or a topic to research.\n"
        "2. **Choose the tone and the target duration.**\n"
        "3. **Click 'Generate Podcast'** to produce your podcast. After the audio is generated, "
        "you can edit the transcript and regenerate the audio with your edits if needed.\n\n"
        "**Research a Topic:** If the topic is too niche or specific, you might not get the desired outcome.\n\n"
        "**Token Limit:** Up to ~2,048 tokens are supported; longer inputs may be truncated.\n"
        "**Note:** YouTube videos only work if they have built-in captions.\n\n"
        "⏳ **Please be patient while your podcast is being generated.** This process involves content analysis, "
        "script creation, and high-quality audio synthesis, which may take a few minutes.\n\n"
        "🔥 **Ready to create your personalized podcast?** Give it a try now and let the magic happen! 🔥"
    )
    col1, col2 = st.columns(2)
    with col1:
        file = st.file_uploader("Upload File (.pdf only)", type=["pdf"])
        url = st.text_input("Or Enter Website URL")
        video_url = st.text_input("Or Enter YouTube Link (Captioned videos)")
    with col2:
        research_topic_input = st.text_input("Or Research a Topic")
        tone = st.radio("Tone", ["Humorous", "Formal", "Casual", "Youthful"], index=2)
        length_minutes = st.slider("Podcast Length (in minutes)", 1, 60, 3)

    st.markdown("### Customize Your Podcast (New Features)")

    with st.expander("Set Host & Guest Names/Descriptions (Optional)"):
        host_name = st.text_input("Host Name (leave blank for 'Jane')")
        host_desc = st.text_input("Host Description (Optional)")
        guest_name = st.text_input("Guest Name (leave blank for 'John')")
        guest_desc = st.text_input("Guest Description (Optional)")

    user_specs = st.text_area("Any special instructions or prompts for the script? (Optional)", "")
    sponsor_content = st.text_area("Sponsored Content / Ad (Optional)", "")
    sponsor_style = st.selectbox(
        "Sponsor Integration Style",
        ["Separate Break", "Blended"]
    )

    custom_bg_music_file = st.file_uploader("Upload Custom Background Music (Optional)", type=["mp3", "wav"])
    custom_bg_music_path = None
    if custom_bg_music_file:
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(custom_bg_music_file.name)[1]) as tmp:
            tmp.write(custom_bg_music_file.read())
            custom_bg_music_path = tmp.name

    if "audio_bytes" not in st.session_state:
        st.session_state["audio_bytes"] = None
    if "transcript" not in st.session_state:
        st.session_state["transcript"] = None
    if "transcript_original" not in st.session_state:
        st.session_state["transcript_original"] = None
    # For Q&A
    if "qa_count" not in st.session_state:
        st.session_state["qa_count"] = 0
    if "conversation_history" not in st.session_state:
        st.session_state["conversation_history"] = ""
    generate_button = st.button("Generate Podcast")

    if generate_button:
        progress_bar = st.progress(0)
        progress_text = st.empty()
        # Staged status messages shown before the (blocking) generation call.
        progress_messages = [
            "🔍 Analyzing your input...",
            "📝 Crafting the perfect script...",
            "🎙️ Generating high-quality audio...",
            "🎶 Adding the finishing touches..."
        ]

        progress_text.write(progress_messages[0])
        progress_bar.progress(0)
        time.sleep(1.0)

        progress_text.write(progress_messages[1])
        progress_bar.progress(25)
        time.sleep(1.0)

        progress_text.write(progress_messages[2])
        progress_bar.progress(50)
        time.sleep(1.0)

        progress_text.write(progress_messages[3])
        progress_bar.progress(75)
        time.sleep(1.0)

        audio_bytes, transcript = generate_podcast(
            file,
            url,
            video_url,
            research_topic_input,
            tone,
            length_minutes,
            host_name,
            host_desc,
            guest_name,
            guest_desc,
            user_specs,
            sponsor_content,
            sponsor_style,
            custom_bg_music_path
        )

        progress_bar.progress(100)
        progress_text.write("✅ Done!")

        if audio_bytes is None:
            # generate_podcast returns (None, error_message) on failure.
            st.error(transcript)
            st.session_state["audio_bytes"] = None
            st.session_state["transcript"] = None
            st.session_state["transcript_original"] = None
        else:
            st.success("Podcast generated successfully!")
            st.session_state["audio_bytes"] = audio_bytes
            st.session_state["transcript"] = transcript
            st.session_state["transcript_original"] = transcript
            st.session_state["qa_count"] = 0
            st.session_state["conversation_history"] = ""

    if st.session_state["audio_bytes"]:
        st.audio(st.session_state["audio_bytes"], format="audio/mp3")
        st.download_button(
            label="Download Podcast (MP3)",
            data=st.session_state["audio_bytes"],
            file_name="my_podcast.mp3",
            mime="audio/mpeg"
        )
st.markdown("### Generated Transcript (Editable)") | |
edited_text = st.text_area( | |
"Feel free to tweak lines, fix errors, or reword anything.", | |
value=st.session_state["transcript"], | |
height=300 | |
) | |
from difflib import SequenceMatcher | |
def highlight_differences(original: str, edited: str) -> str: | |
matcher = SequenceMatcher(None, original.split(), edited.split()) | |
highlighted = [] | |
for opcode, i1, i2, j1, j2 in matcher.get_opcodes(): | |
if opcode == 'equal': | |
highlighted.extend(original.split()[i1:i2]) | |
elif opcode in ('replace', 'insert'): | |
added_words = edited.split()[j1:j2] | |
highlighted.extend([f'<span style="color:red">{word}</span>' for word in added_words]) | |
elif opcode == 'delete': | |
pass | |
return ' '.join(highlighted) | |
if st.session_state["transcript_original"]: | |
highlighted_transcript = highlight_differences( | |
st.session_state["transcript_original"], | |
edited_text | |
) | |
st.markdown("### **Edited Transcript Highlights**", unsafe_allow_html=True) | |
st.markdown(highlighted_transcript, unsafe_allow_html=True) | |
if st.button("Regenerate Audio From Edited Text"): | |
regen_bar = st.progress(0) | |
regen_text = st.empty() | |
regen_text.write("🔄 Regenerating your podcast with the edits...") | |
regen_bar.progress(25) | |
time.sleep(1.0) | |
regen_text.write("🔧 Adjusting the script based on your changes...") | |
regen_bar.progress(50) | |
time.sleep(1.0) | |
dialogue_items = parse_user_edited_transcript( | |
edited_text, | |
host_name or "Jane", | |
guest_name or "John" | |
) | |
new_audio_bytes, new_transcript = regenerate_audio_from_dialogue(dialogue_items, custom_bg_music_path) | |
regen_bar.progress(75) | |
time.sleep(1.0) | |
if new_audio_bytes is None: | |
regen_bar.progress(100) | |
st.error(new_transcript) | |
else: | |
regen_bar.progress(100) | |
regen_text.write("✅ Regeneration complete!") | |
st.success("Regenerated audio below:") | |
st.session_state["audio_bytes"] = new_audio_bytes | |
st.session_state["transcript"] = new_transcript | |
st.session_state["transcript_original"] = new_transcript | |
st.audio(new_audio_bytes, format='audio/mp3') | |
st.download_button( | |
label="Download Edited Podcast (MP3)", | |
data=new_audio_bytes, | |
file_name="my_podcast_edited.mp3", | |
mime="audio/mpeg" | |
) | |
st.markdown("### Updated Transcript") | |
st.markdown(new_transcript) | |
        # ----------- POST-PODCAST Q&A with Microphone -----------
        st.markdown("## Post-Podcast Q&A (Using Microphone)")

        used_questions = st.session_state["qa_count"]
        remaining = MAX_QA_QUESTIONS - used_questions

        if remaining > 0:
            st.write(f"You can ask up to {remaining} more question(s).")
            st.write("### Record Your Follow-Up Question:")

            # Explicit STUN server so we can confirm ICE candidates are gathered.
            new_rtc_config = RTCConfiguration(
                {
                    "iceServers": [
                        {"urls": ["stun:stun.l.google.com:19302"]}
                    ]
                }
            )

            webrtc_ctx = webrtc_streamer(
                key="qna-audio-stream",
                mode=WebRtcMode.SENDONLY,
                rtc_configuration=new_rtc_config,  # STUN server explicitly set
                media_stream_constraints={"audio": True, "video": False},
                audio_processor_factory=AudioBufferProcessor
            )

            if "audio-processor" not in st.session_state:
                st.session_state["audio-processor"] = None

            # While the stream is playing, store the processor and show a debug
            # count of how many audio frames have arrived so far. The access is
            # guarded so a missing processor can't raise an AttributeError.
            if webrtc_ctx.state.playing and webrtc_ctx.audio_processor:
                st.session_state["audio-processor"] = webrtc_ctx.audio_processor
                st.write("Frames so far:", webrtc_ctx.audio_processor.frame_count)

            if not webrtc_ctx.state.playing:
                st.write("Recording stopped. You may now submit your question.")

            if st.button("Submit Q&A"):
                if used_questions >= MAX_QA_QUESTIONS:
                    st.warning("You have reached the Q&A limit.")
                else:
                    processor = st.session_state.get("audio-processor")
                    if not processor or not getattr(processor, "frames", None):
                        st.warning("No recorded audio found. Please record your question first.")
                    else:
                        local_wav_path = processor.finalize_wav()
                        if not local_wav_path:
                            st.warning("No audio frames found. Please record again.")
                        else:
                            st.write("Transcribing your voice question via Deepgram...")
                            question_text = transcribe_audio_deepgram(local_wav_path)
                            if not question_text.strip():
                                st.warning("No transcript found. Please try again.")
                            else:
                                st.write(f"**You asked**: {question_text}")
                                conversation_so_far = st.session_state["conversation_history"]
                                ans_audio, ans_text = handle_qa_exchange(conversation_so_far, question_text)
                                if ans_audio:
                                    st.audio(ans_audio, format="audio/mp3")
                                    st.markdown(f"**John**: {ans_text}")
                                    st.session_state["qa_count"] += 1
                                else:
                                    st.warning("No response could be generated.")
        else:
            st.write(f"You have used all {MAX_QA_QUESTIONS} Q&A opportunities.")
if __name__ == "__main__":
    main()
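
# To run the app locally (assumes utils.py, prompts.py, qa.py, and whatever
# API keys/credentials those modules expect are in place):
#
#   streamlit run app.py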