Spaces:

Bils
/

AIPromoStudio

Running on Zero

App Files Files Community

AIPromoStudio / app.py

Bils

Update app.py

019c404 verified 4 months ago

raw

history blame

7.92 kB

	import gradio as gr
	import os
	import torch
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	pipeline,
	AutoProcessor,
	MusicgenForConditionalGeneration,
	)
	from scipy.io.wavfile import write
	from pydub import AudioSegment
	from dotenv import load_dotenv
	import tempfile
	import spaces
	from TTS.api import TTS

	# Pull configuration from the environment; load_dotenv() (python-dotenv) is
	# expected to merge a local .env file into os.environ first -- hf_token is
	# None when HF_TOKEN is not set anywhere.
	load_dotenv()
	hf_token = os.environ.get("HF_TOKEN")

	# ---------------------------------------------------------------------
	# Script Generation Function
	# ---------------------------------------------------------------------
	@spaces.GPU(duration=300)
	def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
	    """Generate a promo script and a music suggestion with a causal LM.

	    Loads the tokenizer and model named by ``model_id``, prompts the model
	    with a fixed system prompt plus the user's concept, and splits the
	    completion into a script part and a music-suggestion part.

	    Args:
	        user_prompt: The promo concept entered by the user.
	        model_id: Model identifier passed to ``from_pretrained``.
	        token: Access token forwarded to both ``from_pretrained`` calls.
	        duration: Target promo length in seconds; it is only interpolated
	            into the prompt text, not enforced on the output.

	    Returns:
	        A ``(script, music_suggestion)`` tuple of strings. When the
	        completion contains no ``"Music Suggestion:"`` marker the whole
	        completion is returned as the script together with a fixed
	        fallback message. On failure the tuple is
	        ``("Error generating script: ...", None)``.
	    """
	    # Fail fast before downloading/loading a large model for an empty request.
	    if not user_prompt or not user_prompt.strip():
	        return "Error generating script: please enter a promo idea first.", None

	    try:
	        # NOTE(review): recent transformers releases deprecate `use_auth_token`
	        # in favour of `token` -- confirm against the pinned version before
	        # switching.
	        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
	        model = AutoModelForCausalLM.from_pretrained(
	            model_id,
	            use_auth_token=token,
	            torch_dtype=torch.float16,
	            device_map="auto",
	            trust_remote_code=True,
	        )
	        llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

	        system_prompt = (
	            "You are an expert radio imaging producer specializing in sound design and music. "
	            f"Based on the user's concept and the selected duration of {duration} seconds, craft a concise, engaging promo script. "
	            "Ensure the script fits within the time limit and suggest a matching music style that complements the theme."
	        )

	        # The same marker ends the prompt and is used to cut the prompt echo
	        # off the pipeline output, so keep it in one place.
	        answer_marker = "Refined script and music suggestion:"
	        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\n{answer_marker}"
	        result = llama_pipeline(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)

	        generated_text = result[0]["generated_text"].split(answer_marker)[-1].strip()

	        # partition() splits on the FIRST marker only. The previous
	        # two-name unpacking of split() raised ValueError whenever the model
	        # repeated "Music Suggestion:", discarding an otherwise good script.
	        script, marker_found, music_suggestion = generated_text.partition("Music Suggestion:")
	        if marker_found:
	            return script.strip(), music_suggestion.strip()
	        return generated_text, "No specific music suggestion found."
	    except Exception as e:
	        # The result feeds two text boxes, so the message is shown in place.
	        return f"Error generating script: {e}", None

	# ---------------------------------------------------------------------
	# Voice-Over Generation Function
	# ---------------------------------------------------------------------
	@spaces.GPU(duration=300)
	def generate_voice(script: str, speaker: str = "default"):
	    """Synthesize a voice-over for ``script`` and return the WAV file path.

	    Args:
	        script: Text to speak.
	        speaker: Speaker name. It is forwarded to the TTS engine only when
	            the loaded model reports itself as multi-speaker and the value
	            is non-empty; otherwise it is ignored.

	    Returns:
	        Path of the generated WAV file inside the system temp directory.

	    Raises:
	        gr.Error: If the script is empty or synthesis fails. The result feeds
	            an audio component that expects a file path, so an error string
	            cannot be returned in its place.
	    """
	    if not script or not script.strip():
	        raise gr.Error("Error generating voice-over: the script is empty.")

	    try:
	        # Load the TTS model
	        tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", gpu=torch.cuda.is_available())

	        output_path = os.path.join(tempfile.gettempdir(), "generated_voice.wav")

	        # Coqui TTS raises ValueError when `speaker` is given to a model that
	        # is not multi-speaker (the LJSpeech model above is single-speaker),
	        # so the argument is only passed when the model can use it.
	        speaker_kwargs = {}
	        if speaker and getattr(tts, "is_multi_speaker", False):
	            speaker_kwargs["speaker"] = speaker

	        # Generate the speech audio file
	        tts.tts_to_file(text=script, file_path=output_path, **speaker_kwargs)

	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Error generating voice-over: {e}") from e

	# ---------------------------------------------------------------------
	# Music Generation Function
	# ---------------------------------------------------------------------
	@spaces.GPU(duration=300)
	def generate_music(prompt: str, audio_length: int):
	    """Generate music from a text prompt with MusicGen and return the WAV path.

	    Args:
	        prompt: Text description of the desired music.
	        audio_length: Number of new tokens to generate; passed to
	            ``generate`` as ``max_new_tokens``.

	    Returns:
	        Path of the generated 16-bit WAV file inside the system temp directory.

	    Raises:
	        gr.Error: If the prompt is empty or generation fails. The result feeds
	            an audio component that expects a file path, so an error string
	            cannot be returned in its place.
	    """
	    if not prompt or not prompt.strip():
	        raise gr.Error("Error generating music: the music prompt is empty.")

	    try:
	        musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
	        musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        musicgen_model.to(device)

	        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
	        # Slider values may arrive as floats; max_new_tokens must be an int.
	        outputs = musicgen_model.generate(**inputs, max_new_tokens=int(audio_length))

	        # First item of the batch, first channel.
	        audio_data = outputs[0, 0].cpu().numpy()
	        if audio_data.size == 0:
	            raise ValueError("the model returned no audio samples")

	        # Peak-normalize to the int16 range. The array's own max() replaces
	        # the Python builtin (slow element-by-element iteration), and a
	        # silent clip is left at zero instead of dividing by zero.
	        peak = float(abs(audio_data).max())
	        if peak > 0:
	            normalized_audio = (audio_data / peak * 32767).astype("int16")
	        else:
	            normalized_audio = audio_data.astype("int16")

	        # Use the rate the model actually produces (32 kHz for the released
	        # MusicGen checkpoints per the Transformers docs). Writing the samples
	        # with a hard-coded 44100 Hz header plays them too fast and too high.
	        sampling_rate = int(musicgen_model.config.audio_encoder.sampling_rate)

	        output_path = os.path.join(tempfile.gettempdir(), "generated_music.wav")
	        write(output_path, sampling_rate, normalized_audio)

	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Error generating music: {e}") from e

	# ---------------------------------------------------------------------
	# Audio Blending Function with Ducking
	# ---------------------------------------------------------------------
	def blend_audio(voice_path: str, music_path: str, ducking: bool):
	    """Mix a voice-over on top of a music bed and return the WAV file path.

	    Args:
	        voice_path: Path of the voice-over audio file.
	        music_path: Path of the music audio file.
	        ducking: When true, the whole music track is lowered by 10 dB before
	            mixing.

	    Returns:
	        Path of the mixed WAV file inside the system temp directory.

	    Raises:
	        gr.Error: If either input is missing or mixing fails. The result feeds
	            an audio component that expects a file path, so an error string
	            cannot be returned in its place.
	    """
	    if not voice_path or not music_path:
	        raise gr.Error("Error blending audio: generate both the voice-over and the music first.")

	    try:
	        voice = AudioSegment.from_file(voice_path)
	        music = AudioSegment.from_file(music_path)

	        if ducking:
	            music = music - 10  # Lower music volume for ducking

	        # overlay() returns a segment as long as the one it is called on, so
	        # a voice-over longer than the music would be cut off. Loop the music
	        # bed until it covers the voice, then trim it to the voice length.
	        if 0 < len(music) < len(voice):
	            repeats = -(-len(voice) // len(music))  # ceiling division
	            music = (music * repeats)[:len(voice)]

	        combined = music.overlay(voice)
	        output_path = os.path.join(tempfile.gettempdir(), "final_promo.wav")
	        combined.export(output_path, format="wav")

	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Error blending audio: {e}") from e

	# ---------------------------------------------------------------------
	# Gradio Interface
	# ---------------------------------------------------------------------
	with gr.Blocks() as demo:
	    # Page header.
	    gr.Markdown("""
	# 🎧 AI Promo Studio with Step-by-Step Script, Voice, Music, and Mixing 🚀
	Generate and mix radio promos effortlessly with AI tools!
	""")

	    # One tab per pipeline stage; each stage reads the previous stage's
	    # output component as its input.
	    with gr.Tabs():
	        # --- Stage 1: concept -> script text + music suggestion ---------
	        with gr.Tab("Step 1: Generate Script"):
	            with gr.Row():
	                user_prompt = gr.Textbox(
	                    placeholder="E.g., A 30-second promo for a morning show.",
	                    label="Promo Idea",
	                )
	                llama_model_id = gr.Textbox(
	                    value="meta-llama/Meta-Llama-3-8B-Instruct",
	                    label="Llama Model ID",
	                )
	                duration = gr.Slider(
	                    minimum=15,
	                    maximum=60,
	                    step=15,
	                    value=30,
	                    label="Duration (seconds)",
	                )

	            generate_script_button = gr.Button("Generate Script")
	            script_output = gr.Textbox(label="Generated Script")
	            music_suggestion_output = gr.Textbox(label="Music Suggestion")

	            # The token comes from the module-level hf_token rather than the UI.
	            generate_script_button.click(
	                fn=lambda idea, model_name, seconds: generate_script(idea, model_name, hf_token, seconds),
	                inputs=[user_prompt, llama_model_id, duration],
	                outputs=[script_output, music_suggestion_output],
	            )

	        # --- Stage 2: script text -> voice-over file --------------------
	        with gr.Tab("Step 2: Generate Voice"):
	            with gr.Row():
	                speaker = gr.Textbox(
	                    placeholder="E.g., male, female, or neutral.",
	                    label="Voice Style (optional)",
	                )

	            generate_voice_button = gr.Button("Generate Voice")
	            voice_output = gr.Audio(type="filepath", label="Generated Voice")

	            generate_voice_button.click(
	                fn=lambda text, voice_style: generate_voice(text, voice_style),
	                inputs=[script_output, speaker],
	                outputs=[voice_output],
	            )

	        # --- Stage 3: music suggestion -> music file --------------------
	        with gr.Tab("Step 3: Generate Music"):
	            with gr.Row():
	                audio_length = gr.Slider(
	                    minimum=128,
	                    maximum=1024,
	                    step=64,
	                    value=512,
	                    label="Music Length (tokens)",
	                )

	            generate_music_button = gr.Button("Generate Music")
	            music_output = gr.Audio(type="filepath", label="Generated Music")

	            generate_music_button.click(
	                fn=lambda description, token_count: generate_music(description, token_count),
	                inputs=[music_suggestion_output, audio_length],
	                outputs=[music_output],
	            )

	        # --- Stage 4: voice + music -> final mix ------------------------
	        with gr.Tab("Step 4: Blend Audio"):
	            with gr.Row():
	                ducking = gr.Checkbox(value=True, label="Enable Ducking")

	            blend_button = gr.Button("Blend Audio")
	            final_output = gr.Audio(type="filepath", label="Final Promo Audio")

	            blend_button.click(
	                fn=lambda voice_file, music_file, duck: blend_audio(voice_file, music_file, duck),
	                inputs=[voice_output, music_output, ducking],
	                outputs=[final_output],
	            )

	    # Page footer.
	    gr.Markdown("""
	<hr>
	<p style="text-align: center; font-size: 0.9em;">
	Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
	</p>
	""")

	# Start the app as soon as the module runs.
	demo.launch(debug=True)