import os
import re
import torch
import tempfile
from scipy.io.wavfile import write
from pydub import AudioSegment
from dotenv import load_dotenv
import spaces
import gradio as gr

# Transformers & Models
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AutoProcessor,
    MusicgenForConditionalGeneration,
)

# Coqui TTS
from TTS.api import TTS

# ---------------------------------------------------------------------
# Load Environment Variables
# ---------------------------------------------------------------------
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

# ---------------------------------------------------------------------
# Global Model Caches
# ---------------------------------------------------------------------
LLAMA_PIPELINES = {}
MUSICGEN_MODELS = {}
TTS_MODELS = {}

# ---------------------------------------------------------------------
# Utility Function: Clean Text
# ---------------------------------------------------------------------
def clean_text(text: str) -> str:
    """
    Removes undesired characters (e.g., asterisks) that might not be
    recognized by the model's vocabulary.
    """
    return re.sub(r'\*', '', text)

# ---------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------
def get_llama_pipeline(model_id: str, token: str):
    """
    Returns a cached LLaMA pipeline if available; otherwise, loads it.
    """
    if model_id in LLAMA_PIPELINES:
        return LLAMA_PIPELINES[model_id]

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_auth_token=token,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
    LLAMA_PIPELINES[model_id] = text_pipeline
    return text_pipeline


def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
    """
    Returns a cached MusicGen model if available; otherwise, loads it.
    Uses the 'large' variant for higher quality outputs.
    """
    if model_key in MUSICGEN_MODELS:
        return MUSICGEN_MODELS[model_key]

    model = MusicgenForConditionalGeneration.from_pretrained(model_key)
    processor = AutoProcessor.from_pretrained(model_key)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    MUSICGEN_MODELS[model_key] = (model, processor)
    return model, processor


def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Returns a cached TTS model if available; otherwise, loads it.
    """
    if model_name in TTS_MODELS:
        return TTS_MODELS[model_name]

    tts_model = TTS(model_name)
    TTS_MODELS[model_name] = tts_model
    return tts_model

# ---------------------------------------------------------------------
# Script Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
    """
    Generates a script, sound design suggestions, and music ideas from a user prompt.
    Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
    """
    try:
        text_pipeline = get_llama_pipeline(model_id, token)

        system_prompt = (
            "You are an expert radio imaging producer specializing in sound design and music. "
            f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
            "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
            "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
            "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
        )
        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"

        with torch.inference_mode():
            result = text_pipeline(
                combined_prompt,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.8,
            )

        generated_text = result[0]["generated_text"]
        if "Output:" in generated_text:
            generated_text = generated_text.split("Output:")[-1].strip()

        # Default placeholders
        voice_script = "No voice-over script found."
        sound_design = "No sound design suggestions found."
        music_suggestions = "No music suggestions found."

        # Voice-Over Script
        if "Voice-Over Script:" in generated_text:
            parts = generated_text.split("Voice-Over Script:")
            voice_script_part = parts[1]
            if "Sound Design Suggestions:" in voice_script_part:
                voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
            else:
                voice_script = voice_script_part.strip()

        # Sound Design
        if "Sound Design Suggestions:" in generated_text:
            parts = generated_text.split("Sound Design Suggestions:")
            sound_design_part = parts[1]
            if "Music Suggestions:" in sound_design_part:
                sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
            else:
                sound_design = sound_design_part.strip()

        # Music Suggestions
        if "Music Suggestions:" in generated_text:
            parts = generated_text.split("Music Suggestions:")
            music_suggestions = parts[1].strip()

        return voice_script, sound_design, music_suggestions

    except Exception as e:
        return f"Error generating script: {e}", "", ""

# ---------------------------------------------------------------------
# Ad Promo Idea Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_ad_promo_idea(user_prompt: str, model_id: str, token: str):
    """
    Generates a creative ad promo idea based on the user's concept.
    Returns a string containing the ad promo idea.
    """
    try:
        text_pipeline = get_llama_pipeline(model_id, token)

        system_prompt = (
            "You are a creative advertising strategist. "
            "Generate a unique and engaging ad promo idea based on the following concept. "
            "Include creative angles, potential taglines, and media suggestions."
        )
        combined_prompt = f"{system_prompt}\nConcept: {user_prompt}\nAd Promo Idea:"

        with torch.inference_mode():
            result = text_pipeline(
                combined_prompt,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.8,
            )

        generated_text = result[0]["generated_text"]
        if "Ad Promo Idea:" in generated_text:
            generated_text = generated_text.split("Ad Promo Idea:")[-1].strip()

        return generated_text

    except Exception as e:
        return f"Error generating ad promo idea: {e}"

# ---------------------------------------------------------------------
# Voice-Over Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Generates a voice-over from the provided script using the Coqui TTS model.
    Returns the file path to the generated .wav file.
    """
    try:
        if not script.strip():
            return "Error: No script provided."
        cleaned_script = clean_text(script)
        tts_model = get_tts_model(tts_model_name)

        output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
        tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
        return output_path

    except Exception as e:
        return f"Error generating voice: {e}"

# ---------------------------------------------------------------------
# Music Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=200)
def generate_music(prompt: str, audio_length: int):
    """
    Generates music with the 'facebook/musicgen-large' model based on the prompt.
    Returns the file path to the generated .wav file.
    """
    try:
        if not prompt.strip():
            return "Error: No music suggestion provided."

        model_key = "facebook/musicgen-large"
        musicgen_model, musicgen_processor = get_musicgen_model(model_key)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)

        # audio_length is the number of audio tokens to generate; MusicGen
        # produces roughly 50 tokens per second of audio.
        with torch.inference_mode():
            outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)

        # Peak-normalize to 16-bit PCM for the WAV container.
        audio_data = outputs[0, 0].cpu().numpy()
        normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")

        output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
        # Write at the model's own sampling rate (32 kHz for the released
        # MusicGen checkpoints); a hard-coded 44100 would shift pitch and tempo.
        sampling_rate = musicgen_model.config.audio_encoder.sampling_rate
        write(output_path, sampling_rate, normalized_audio)

        return output_path

    except Exception as e:
        return f"Error generating music: {e}"

# ---------------------------------------------------------------------
# Audio Blending with Duration Sync & Ducking
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
    """
    Blends two audio files (voice and music).
    Returns the file path to the blended .wav file.
    """
    try:
        if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
            return "Error: Missing audio files for blending."

        voice = AudioSegment.from_wav(voice_path)
        music = AudioSegment.from_wav(music_path)

        voice_len = len(voice)
        music_len = len(music)

        # Loop the music until it covers the voice-over, then trim the excess.
        if music_len < voice_len:
            looped_music = AudioSegment.empty()
            while len(looped_music) < voice_len:
                looped_music += music
            music = looped_music
        if len(music) > voice_len:
            music = music[:voice_len]

        # Optionally duck the music bed so the voice-over sits on top.
        if ducking:
            ducked_music = music - duck_level
            final_audio = ducked_music.overlay(voice)
        else:
            final_audio = music.overlay(voice)

        output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
        final_audio.export(output_path, format="wav")
        return output_path

    except Exception as e:
        return f"Error blending audio: {e}"

# ---------------------------------------------------------------------
# Gradio Interface with Enhanced UI
# ---------------------------------------------------------------------
with gr.Blocks(css="""
    /* Global Styles */
    body {
        background: linear-gradient(135deg, #1d1f21, #3a3d41);
        color: #f0f0f0;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .header {
        text-align: center;
        padding: 2rem 1rem;
        background: linear-gradient(90deg, #6a11cb, #2575fc);
        border-radius: 0 0 20px 20px;
        margin-bottom: 2rem;
    }
    .header h1 {
        margin: 0;
        font-size: 2.5rem;
    }
    .header p {
        font-size: 1.2rem;
    }
    .gradio-container {
        background: #2e2e2e;
        border-radius: 10px;
        padding: 1rem;
    }
    .tab-title {
        font-size: 1.1rem;
        font-weight: bold;
    }
    .footer {
        text-align: center;
        font-size: 0.9em;
        margin-top: 2rem;
        padding: 1rem;
        color: #cccccc;
    }
""") as demo:
    # Custom Header
    with gr.Row(elem_classes="header"):
        gr.Markdown("""
Your all-in-one AI solution for crafting engaging audio ads.
Demo MVP