import gradio as gr import os import torch from transformers import ( AutoTokenizer, AutoModelForCausalLM, pipeline, AutoProcessor, MusicgenForConditionalGeneration, ) from scipy.io.wavfile import write from pydub import AudioSegment from dotenv import load_dotenv import tempfile import spaces from TTS.api import TTS # ------------------------------- # Configuration # ------------------------------- load_dotenv() HF_TOKEN = os.getenv("HF_TOKEN") MODEL_CONFIG = { "llama_models": { "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct", "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2", "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct" }, "tts_models": { "Standard English": "tts_models/en/ljspeech/tacotron2-DDC", "High Quality": "tts_models/en/ljspeech/vits", "Fast Inference": "tts_models/en/sam/tacotron-DDC" } } # ------------------------------- # Model Manager # ------------------------------- class ModelManager: def __init__(self): self.llama_pipelines = {} self.musicgen_models = {} self.tts_models = {} def get_llama_pipeline(self, model_id, token): if model_id not in self.llama_pipelines: tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token) model = AutoModelForCausalLM.from_pretrained( model_id, use_auth_token=token, torch_dtype=torch.float16, device_map="auto", attn_implementation="flash_attention_2" ) self.llama_pipelines[model_id] = pipeline( "text-generation", model=model, tokenizer=tokenizer, device_map="auto" ) return self.llama_pipelines[model_id] def get_musicgen_model(self, model_key="facebook/musicgen-large"): if model_key not in self.musicgen_models: model = MusicgenForConditionalGeneration.from_pretrained(model_key) processor = AutoProcessor.from_pretrained(model_key) device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) self.musicgen_models[model_key] = (model, processor) return self.musicgen_models[model_key] def get_tts_model(self, model_name): if model_name not in self.tts_models: self.tts_models[model_name] = TTS(model_name) return self.tts_models[model_name] model_manager = ModelManager() # ------------------------------- # Core Functions # ------------------------------- @spaces.GPU(duration=120) def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens=512): try: text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN) system_prompt = f"""You are an expert radio imaging producer. Create content for a {duration}-second promo: 1. Voice Script: [Clear narration] 2. Sound Design: [3-5 effects] 3. Music: [Genre/tempo/mood] Respond in this exact format:""" prompt = f"{system_prompt}\nConcept: {user_prompt}\nVoice Script:" response = text_pipeline( prompt, max_new_tokens=max_tokens, temperature=temperature, do_sample=True, top_p=0.95, eos_token_id=text_pipeline.tokenizer.eos_token_id ) return parse_generated_content(response[0]["generated_text"]) except Exception as e: return f"Error: {str(e)}", "", "" def parse_generated_content(text): sections = { "Voice Script": "", "Sound Design": "", "Music": "" } current_section = None for line in text.split('\n'): line = line.strip() if "Voice Script:" in line: current_section = "Voice Script" line = line.replace("Voice Script:", "").strip() elif "Sound Design:" in line: current_section = "Sound Design" line = line.replace("Sound Design:", "").strip() elif "Music:" in line: current_section = "Music" line = line.replace("Music:", "").strip() if current_section and line: sections[current_section] += line + "\n" return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip() @spaces.GPU(duration=100) def generate_voice(script, tts_model, speed=1.0): try: if not script.strip(): return "Error: Empty script" tts = model_manager.get_tts_model(tts_model) output_path = os.path.join(tempfile.gettempdir(), "voice.wav") tts.tts_to_file( text=script, file_path=output_path, speed=speed ) return output_path except Exception as e: return f"Error: {str(e)}" @spaces.GPU(duration=150) def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0): try: model, processor = model_manager.get_musicgen_model() device = "cuda" if torch.cuda.is_available() else "cpu" inputs = processor( text=[prompt], padding=True, return_tensors="pt", ).to(device) audio_values = model.generate( **inputs, max_new_tokens=int(duration_sec * 50), temperature=temperature, guidance_scale=guidance_scale, do_sample=True ) output_path = os.path.join(tempfile.gettempdir(), "music.wav") write(output_path, 44100, audio_values[0, 0].cpu().numpy()) return output_path except Exception as e: return f"Error: {str(e)}" def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=500): try: voice = AudioSegment.from_wav(voice_path) music = AudioSegment.from_wav(music_path) # Align durations with crossfade if len(music) < len(voice): loops = (len(voice) // len(music)) + 1 music = music * loops music = music[:len(voice)].fade_out(crossfade) voice = voice.fade_in(crossfade) # Apply ducking if ducking: ducked_music = music - duck_level mixed = ducked_music.overlay(voice) else: mixed = music.overlay(voice) output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav") mixed.export(output_path, format="wav") return output_path except Exception as e: return f"Error: {str(e)}" # ------------------------------- # Gradio Interface # ------------------------------- theme = gr.themes.Soft( primary_hue="blue", secondary_hue="teal", ).set( body_text_color_dark='#FFFFFF', background_fill_primary_dark='#1F1F1F' ) with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo: gr.Markdown(""" # 🎧 AI Radio Studio Pro *Professional Audio Production in 4 Steps* """) with gr.Tabs(): # Step 1: Concept Development with gr.Tab("1️⃣ Concept"): with gr.Row(): with gr.Column(scale=2): concept_input = gr.Textbox( label="Your Idea", placeholder="e.g., A 30-second morning show intro with energetic music...", lines=3 ) with gr.Accordion("Advanced Settings", open=False): model_selector = gr.Dropdown( choices=list(MODEL_CONFIG["llama_models"].values()), label="AI Model", value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"] ) duration_slider = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)") temp_slider = gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Creativity") generate_btn = gr.Button("Generate Script", variant="primary") with gr.Column(scale=1): script_output = gr.Textbox(label="Voice Script", interactive=True) sound_output = gr.Textbox(label="Sound Design", interactive=True) music_output = gr.Textbox(label="Music Style", interactive=True) # Step 2: Voice Production with gr.Tab("2️⃣ Voice"): with gr.Row(): with gr.Column(): tts_selector = gr.Dropdown( choices=list(MODEL_CONFIG["tts_models"].values()), label="Voice Model", value="tts_models/en/ljspeech/tacotron2-DDC" ) speed_slider = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate") voice_btn = gr.Button("Generate Voiceover", variant="primary") with gr.Column(): voice_preview = gr.Audio(label="Preview", type="filepath") # Step 3: Music Production with gr.Tab("3️⃣ Music"): with gr.Row(): with gr.Column(): music_duration = gr.Slider(10, 120, 30, label="Duration (seconds)") music_temp = gr.Slider(0.1, 2.0, 1.0, label="Creativity") guidance_scale = gr.Slider(1.0, 5.0, 3.0, label="Focus") music_btn = gr.Button("Generate Music", variant="primary") with gr.Column(): music_preview = gr.Audio(label="Preview", type="filepath") # Step 4: Final Mix with gr.Tab("4️⃣ Mix"): with gr.Row(): with gr.Column(): ducking_toggle = gr.Checkbox(True, label="Enable Voice Ducking") duck_level = gr.Slider(0, 30, 12, label="Ducking Strength (dB)") crossfade_time = gr.Slider(0, 2000, 500, label="Crossfade (ms)") mix_btn = gr.Button("Create Final Mix", variant="primary") with gr.Column(): final_mix = gr.Audio(label="Master Output", type="filepath") # Examples & Footer with gr.Accordion("💡 Example Prompts", open=False): gr.Examples( examples=[ ["A 45-second tech podcast intro with futuristic synth effects"], ["A 15-second coffee shop radio ad with morning acoustic vibes"], ["A 60-second documentary trailer with epic orchestral music"] ], inputs=concept_input ) gr.Markdown("""
Created with ❤️ by bilsimaging.com