Spaces:

Bils
/

AIPromoStudio

Running on Zero

App Files Files Community

AIPromoStudio / app.py

Bils

Update app.py

8d064dc verified 4 months ago

raw

history blame

12.1 kB

	import gradio as gr
	import os
	import torch
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	pipeline,
	AutoProcessor,
	MusicgenForConditionalGeneration,
	)
	from scipy.io.wavfile import write
	from pydub import AudioSegment
	from dotenv import load_dotenv
	import tempfile
	import spaces
	from TTS.api import TTS

	# -------------------------------
	# Configuration
	# -------------------------------
	# Pull variables from a local .env file (if any) into the process environment
	# before the Hub token is read.
	load_dotenv()
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Display name -> model identifier, grouped by model family. The UI
	# dropdowns are populated from the identifier values.
	MODEL_CONFIG = dict(
	    llama_models={
	        "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
	        "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
	        "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct",
	    },
	    tts_models={
	        "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
	        "High Quality": "tts_models/en/ljspeech/vits",
	        "Fast Inference": "tts_models/en/sam/tacotron-DDC",
	    },
	)

	# -------------------------------
	# Model Manager
	# -------------------------------
	class ModelManager:
	    """Lazily load and cache the text, music, and speech models.

	    Every ``get_*`` method loads its model the first time a given
	    identifier is requested and returns the cached object afterwards.
	    """

	    def __init__(self):
	        # One cache per model family, keyed by model identifier.
	        self.llama_pipelines = {}
	        self.musicgen_models = {}
	        self.tts_models = {}

	    def get_llama_pipeline(self, model_id, token):
	        """Return the cached ``text-generation`` pipeline for ``model_id``.

	        Args:
	            model_id: Hub repository id of the causal language model.
	            token: Hub access token forwarded to ``from_pretrained``.

	        Returns:
	            The pipeline built from the loaded tokenizer and model.
	        """
	        if model_id not in self.llama_pipelines:
	            # ``token`` is the supported keyword; ``use_auth_token`` is
	            # deprecated in transformers.
	            tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
	            model = AutoModelForCausalLM.from_pretrained(
	                model_id,
	                token=token,
	                torch_dtype=torch.float16,
	                device_map="auto",
	                attn_implementation="flash_attention_2",
	            )
	            # The model was already dispatched by ``device_map="auto"``
	            # above, so the redundant ``device_map`` is not repeated here.
	            self.llama_pipelines[model_id] = pipeline(
	                "text-generation",
	                model=model,
	                tokenizer=tokenizer,
	            )
	        return self.llama_pipelines[model_id]

	    def get_musicgen_model(self, model_key="facebook/musicgen-large"):
	        """Return the cached ``(model, processor)`` pair for ``model_key``.

	        On first use the model is moved to CUDA when available, else CPU.
	        """
	        if model_key not in self.musicgen_models:
	            model = MusicgenForConditionalGeneration.from_pretrained(model_key)
	            processor = AutoProcessor.from_pretrained(model_key)
	            device = "cuda" if torch.cuda.is_available() else "cpu"
	            model.to(device)
	            self.musicgen_models[model_key] = (model, processor)
	        return self.musicgen_models[model_key]

	    def get_tts_model(self, model_name):
	        """Return the cached ``TTS`` instance for ``model_name``."""
	        if model_name not in self.tts_models:
	            self.tts_models[model_name] = TTS(model_name)
	        return self.tts_models[model_name]

	# Single module-level manager shared by the generation functions below, so
	# models loaded for one request are reused by later ones.
	model_manager = ModelManager()

	# -------------------------------
	# Core Functions
	# -------------------------------
	@spaces.GPU(duration=120)
	def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens=512):
	    """Generate the voice script, sound-design notes, and music brief.

	    Args:
	        user_prompt: The user's promo concept.
	        model_id: Identifier of the language model to load.
	        duration: Target promo length in seconds, inserted into the prompt.
	        temperature: Sampling temperature passed to the pipeline.
	        max_tokens: Maximum number of newly generated tokens.

	    Returns:
	        A ``(voice_script, sound_design, music)`` tuple of strings. On
	        failure the first element is ``"Error: <message>"`` and the other
	        two are empty.
	    """
	    try:
	        text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)

	        system_prompt = f"""You are an expert radio imaging producer. Create content for a {duration}-second promo:
	1. Voice Script: [Clear narration]
	2. Sound Design: [3-5 effects]
	3. Music: [Genre/tempo/mood]

	Respond in this exact format:"""

	        prompt = f"{system_prompt}\nConcept: {user_prompt}\nVoice Script:"

	        response = text_pipeline(
	            prompt,
	            max_new_tokens=max_tokens,
	            temperature=temperature,
	            do_sample=True,
	            top_p=0.95,
	            eos_token_id=text_pipeline.tokenizer.eos_token_id,
	            # Return only the completion. With the prompt echoed back, its
	            # template lines ("1. Voice Script: [Clear narration]",
	            # "Concept: ...") were parsed as if they were generated content.
	            return_full_text=False,
	        )

	        completion = response[0]["generated_text"]
	        # The prompt ends with "Voice Script:", so the completion starts in
	        # the middle of that section; restore the header for the parser.
	        return parse_generated_content(f"Voice Script:{completion}")
	    except Exception as e:
	        return f"Error: {str(e)}", "", ""

	def parse_generated_content(text):
	    """Split generated text into voice script, sound design, and music parts.

	    A line containing ``Voice Script:``, ``Sound Design:`` or ``Music:``
	    (checked in that order) starts the matching section; following lines
	    belong to it until the next header. Lines before the first header are
	    ignored. List numbering or markdown emphasis wrapped around a header
	    (``2. Sound Design:``, ``**Music:**``) is removed with the header
	    instead of leaking into the section content.

	    Args:
	        text: Raw generated text; ``None`` is treated as empty.

	    Returns:
	        A ``(voice_script, sound_design, music)`` tuple of strings, each
	        with its lines joined by newlines and no surrounding whitespace.
	    """
	    headers = ("Voice Script", "Sound Design", "Music")
	    # Characters that can precede a header purely as list/markdown markup.
	    decoration = "0123456789.)(-*_#>\u2022 \t"
	    sections = {header: [] for header in headers}
	    current_section = None

	    for raw_line in (text or "").split("\n"):
	        line = raw_line.strip()

	        for header in headers:
	            marker = f"{header}:"
	            if marker not in line:
	                continue

	            current_section = header
	            before, _, after = line.partition(marker)
	            if before.strip(decoration):
	                # Real text precedes the header: keep it and drop only the
	                # marker itself.
	                line = line.replace(marker, "").strip()
	            else:
	                # Only numbering/markup precedes the header: discard it.
	                line = after.replace(marker, "").strip()
	                # Drop the closing half of an emphasis pair such as the
	                # trailing "**" in "**Music:** upbeat".
	                opening = before.rstrip()
	                for emphasis in ("**", "__", "*", "_"):
	                    if opening.endswith(emphasis) and line.startswith(emphasis):
	                        line = line[len(emphasis):].strip()
	                        break
	            break

	        if current_section and line:
	            sections[current_section].append(line)

	    return tuple("\n".join(sections[header]) for header in headers)

	@spaces.GPU(duration=100)
	def generate_voice(script, tts_model, speed=1.0):
	    """Synthesize ``script`` to a WAV file and return the file path.

	    Args:
	        script: Text to speak.
	        tts_model: Name of the TTS model to load through the model manager.
	        speed: Speaking-rate value forwarded to ``tts_to_file``.

	    Returns:
	        Path of the generated WAV file.

	    Raises:
	        gr.Error: If the script is empty or synthesis fails. The result
	            feeds an audio output that expects a file path, so failures
	            are raised rather than returned as a message string.
	    """
	    if not script or not script.strip():
	        raise gr.Error("Empty script")

	    try:
	        tts = model_manager.get_tts_model(tts_model)

	        # Reserve a unique file per request; a fixed name in the temp
	        # directory would be overwritten by concurrent requests.
	        with tempfile.NamedTemporaryFile(
	            prefix="voice_", suffix=".wav", delete=False
	        ) as handle:
	            output_path = handle.name

	        tts.tts_to_file(
	            text=script,
	            file_path=output_path,
	            speed=speed,
	        )
	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Voice generation failed: {e}") from e

	@spaces.GPU(duration=150)
	def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0):
	    """Generate music for ``prompt`` and return the path of a WAV file.

	    Args:
	        prompt: Text description of the music.
	        duration_sec: Requested length in seconds (50 tokens per second).
	        temperature: Sampling temperature passed to ``generate``.
	        guidance_scale: Guidance scale passed to ``generate``.

	    Returns:
	        Path of the generated WAV file.

	    Raises:
	        gr.Error: If generation fails. The result feeds an audio output
	            that expects a file path, so failures are raised rather than
	            returned as a message string.
	    """
	    try:
	        model, processor = model_manager.get_musicgen_model()
	        device = "cuda" if torch.cuda.is_available() else "cpu"

	        inputs = processor(
	            text=[prompt],
	            padding=True,
	            return_tensors="pt",
	        ).to(device)

	        # The MusicGen documentation gives 1503 tokens (about 30 seconds)
	        # as the generation limit, so longer requests are capped there.
	        new_tokens = min(int(duration_sec * 50), 1503)

	        audio_values = model.generate(
	            **inputs,
	            max_new_tokens=new_tokens,
	            temperature=temperature,
	            guidance_scale=guidance_scale,
	            do_sample=True,
	        )

	        # Write at the rate the model actually produces; a hard-coded
	        # 44100 Hz makes the audio play back too fast and too high.
	        sampling_rate = model.config.audio_encoder.sampling_rate

	        # Unique file per request so concurrent requests do not overwrite
	        # each other's output.
	        with tempfile.NamedTemporaryFile(
	            prefix="music_", suffix=".wav", delete=False
	        ) as handle:
	            output_path = handle.name

	        write(output_path, sampling_rate, audio_values[0, 0].cpu().numpy())
	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Music generation failed: {e}") from e

	def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=500):
	    """Mix the voiceover over the music bed and return the mixed WAV path.

	    The music is looped when shorter than the voice, trimmed to the voice
	    length, faded out while the voice fades in, optionally lowered by
	    ``duck_level``, and then overlaid with the voice.

	    Args:
	        voice_path: Path of the voice WAV file.
	        music_path: Path of the music WAV file.
	        ducking: Whether to lower the music level before mixing.
	        duck_level: Amount (dB) subtracted from the music when ducking.
	        crossfade: Fade length in milliseconds; ``0`` disables the fades.

	    Returns:
	        Path of the mixed WAV file.

	    Raises:
	        gr.Error: If an input is missing or mixing fails. The result feeds
	            an audio output that expects a file path, so failures are
	            raised rather than returned as a message string.
	    """
	    if not voice_path or not music_path:
	        raise gr.Error("Generate both the voiceover and the music before mixing.")

	    try:
	        voice = AudioSegment.from_wav(voice_path)
	        music = AudioSegment.from_wav(music_path)

	        if len(voice) == 0 or len(music) == 0:
	            # Guards the loop-count division below against empty music.
	            raise ValueError("voice and music must both contain audio")

	        # Loop the music until it covers the voice, then trim to length.
	        if len(music) < len(voice):
	            loops = (len(voice) // len(music)) + 1
	            music = music * loops
	        music = music[:len(voice)]

	        # Slider values may arrive as floats and may be zero; a zero-length
	        # fade is a no-op, so the fade calls are skipped in that case.
	        fade_ms = min(int(crossfade), len(voice))
	        if fade_ms > 0:
	            music = music.fade_out(fade_ms)
	            voice = voice.fade_in(fade_ms)

	        # Apply ducking
	        if ducking:
	            music = music - duck_level
	        mixed = music.overlay(voice)

	        # Unique file per request so concurrent requests do not overwrite
	        # each other's output.
	        with tempfile.NamedTemporaryFile(
	            prefix="final_mix_", suffix=".wav", delete=False
	        ) as handle:
	            output_path = handle.name

	        mixed.export(output_path, format="wav")
	        return output_path
	    except Exception as e:
	        raise gr.Error(f"Mixing failed: {e}") from e

	# -------------------------------
	# Gradio Interface
	# -------------------------------
	# Soft base theme with blue/teal accents; the dark-mode text and background
	# colours are then overridden.
	_soft_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="teal")
	theme = _soft_theme.set(
	    body_text_color_dark="#FFFFFF",
	    background_fill_primary_dark="#1F1F1F",
	)

	# Four-tab UI (concept -> voice -> music -> mix) followed by the event wiring
	# that connects each button to its generation function.
	# NOTE(review): indentation was lost in this copy of the file; the nesting
	# below is reconstructed from the component order - confirm against the
	# deployed layout (in particular whether generate_btn sits outside the
	# "Advanced Settings" accordion).
	with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo:
	    gr.Markdown("""
	# 🎧 AI Radio Studio Pro
	Professional Audio Production in 4 Steps
	""")

	    with gr.Tabs():
	        # Step 1: Concept Development
	        with gr.Tab("1️⃣ Concept"):
	            with gr.Row():
	                with gr.Column(scale=2):
	                    concept_input = gr.Textbox(
	                        label="Your Idea",
	                        placeholder="e.g., A 30-second morning show intro with energetic music...",
	                        lines=3
	                    )
	                    with gr.Accordion("Advanced Settings", open=False):
	                        # Choices are the model identifiers (dict values),
	                        # not the display names (dict keys).
	                        model_selector = gr.Dropdown(
	                            choices=list(MODEL_CONFIG["llama_models"].values()),
	                            label="AI Model",
	                            value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
	                        )
	                        duration_slider = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)")
	                        temp_slider = gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Creativity")
	                    generate_btn = gr.Button("Generate Script", variant="primary")

	                # Editable outputs: the script and music text are reused
	                # as inputs of the voice and music steps below.
	                with gr.Column(scale=1):
	                    script_output = gr.Textbox(label="Voice Script", interactive=True)
	                    sound_output = gr.Textbox(label="Sound Design", interactive=True)
	                    music_output = gr.Textbox(label="Music Style", interactive=True)

	        # Step 2: Voice Production
	        with gr.Tab("2️⃣ Voice"):
	            with gr.Row():
	                with gr.Column():
	                    tts_selector = gr.Dropdown(
	                        choices=list(MODEL_CONFIG["tts_models"].values()),
	                        label="Voice Model",
	                        value="tts_models/en/ljspeech/tacotron2-DDC"
	                    )
	                    speed_slider = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate")
	                    voice_btn = gr.Button("Generate Voiceover", variant="primary")
	                with gr.Column():
	                    voice_preview = gr.Audio(label="Preview", type="filepath")

	        # Step 3: Music Production
	        with gr.Tab("3️⃣ Music"):
	            with gr.Row():
	                with gr.Column():
	                    music_duration = gr.Slider(10, 120, 30, label="Duration (seconds)")
	                    music_temp = gr.Slider(0.1, 2.0, 1.0, label="Creativity")
	                    guidance_scale = gr.Slider(1.0, 5.0, 3.0, label="Focus")
	                    music_btn = gr.Button("Generate Music", variant="primary")
	                with gr.Column():
	                    music_preview = gr.Audio(label="Preview", type="filepath")

	        # Step 4: Final Mix
	        with gr.Tab("4️⃣ Mix"):
	            with gr.Row():
	                with gr.Column():
	                    ducking_toggle = gr.Checkbox(True, label="Enable Voice Ducking")
	                    duck_level = gr.Slider(0, 30, 12, label="Ducking Strength (dB)")
	                    crossfade_time = gr.Slider(0, 2000, 500, label="Crossfade (ms)")
	                    mix_btn = gr.Button("Create Final Mix", variant="primary")
	                with gr.Column():
	                    final_mix = gr.Audio(label="Master Output", type="filepath")

	    # Examples & Footer
	    with gr.Accordion("💡 Example Prompts", open=False):
	        # Each example fills the concept textbox.
	        gr.Examples(
	            examples=[
	                ["A 45-second tech podcast intro with futuristic synth effects"],
	                ["A 15-second coffee shop radio ad with morning acoustic vibes"],
	                ["A 60-second documentary trailer with epic orchestral music"]
	            ],
	            inputs=concept_input
	        )

	    gr.Markdown("""
	<div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 1px solid #444;">
	<p style="font-size: 0.9em; color: #888;">
	Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #66b3ff;">bilsimaging.com</a>
	</p>
	<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
	<img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759"/>
	</a>
	</div>
	""")

	    # Event Handling
	    # Script generation fills the three text outputs. max_tokens is not
	    # wired to the UI, so generate_script uses its default.
	    generate_btn.click(
	        generate_script,
	        inputs=[concept_input, model_selector, duration_slider, temp_slider],
	        outputs=[script_output, sound_output, music_output]
	    )

	    # The voiceover is synthesized from the (possibly edited) script text.
	    voice_btn.click(
	        generate_voice,
	        inputs=[script_output, tts_selector, speed_slider],
	        outputs=voice_preview
	    )

	    # The music prompt is the (possibly edited) "Music Style" text.
	    music_btn.click(
	        generate_music,
	        inputs=[music_output, music_duration, music_temp, guidance_scale],
	        outputs=music_preview
	    )

	    # The mix step reads the file paths held by the two preview players.
	    mix_btn.click(
	        blend_audio,
	        inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
	        outputs=final_mix
	    )

	# Start the server only when the file is executed as a script; it listens on
	# all interfaces ("0.0.0.0") at port 7860.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)