Spaces: Bils/AIPromoStudio

Running on Zero

App Files Community

Bils committed on Jan 30

Commit

7b531cd

verified ·

1 Parent(s): 1a03830

Update app.py

Browse files

Files changed (1) hide show

app.py +184 -102

app.py CHANGED Viewed

@@ -25,10 +25,12 @@ MODEL_CONFIG = {
     "llama_models": {
         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
     },
     "tts_models": {
         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
         "High Quality": "tts_models/en/ljspeech/vits",
     }
 }
@@ -43,17 +45,19 @@ class ModelManager:
     def get_llama_pipeline(self, model_id, token):
         if model_id not in self.llama_pipelines:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
-                use_auth_token=token,
                 torch_dtype=torch.float16,
-                device_map="auto"
             )
             self.llama_pipelines[model_id] = pipeline(
                 "text-generation",
                 model=model,
-                tokenizer=tokenizer
             )
         return self.llama_pipelines[model_id]
@@ -61,6 +65,8 @@ class ModelManager:
         if model_key not in self.musicgen_models:
             model = MusicgenForConditionalGeneration.from_pretrained(model_key)
             processor = AutoProcessor.from_pretrained(model_key)
             self.musicgen_models[model_key] = (model, processor)
         return self.musicgen_models[model_key]
@@ -74,26 +80,34 @@ model_manager = ModelManager()
 # -------------------------------
 # Core Functions
 # -------------------------------
-@spaces.GPU
-def generate_script(user_prompt, model_id, duration, temperature=0.7):
     try:
         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
-        prompt = f"""Create a {duration}-second audio promo script with these elements:
-1. Voice Script: [clear narration]
-2. Sound Design: [3-5 effects]
-3. Music: [genre/tempo]
-Concept: {user_prompt}"""
-        result = text_pipeline(
-            prompt,
-            max_new_tokens=300,
             temperature=temperature,
-            do_sample=True
         )
-        return parse_generated_content(result[0]["generated_text"])
     except Exception as e:
         return f"Error: {str(e)}", "", ""
@@ -122,48 +136,68 @@ def parse_generated_content(text):
     return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip()
-@spaces.GPU
 def generate_voice(script, tts_model, speed=1.0):
     try:
         if not script.strip():
-            return "Error: No script provided"
         tts = model_manager.get_tts_model(tts_model)
-        output_path = os.path.join(tempfile.gettempdir(), "voice.wav")
-        tts.tts_to_file(text=script, file_path=output_path)
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
-@spaces.GPU
-def generate_music(prompt, duration_sec=30):
     try:
         model, processor = model_manager.get_musicgen_model()
-        inputs = processor(text=[prompt], padding=True, return_tensors="pt")
-        audio_values = model.generate(**inputs, max_new_tokens=int(duration_sec * 50))
-        output_path = os.path.join(tempfile.gettempdir(), "music.wav")
-        write(output_path, 44100, audio_values[0, 0].cpu().numpy())
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
-def blend_audio(voice_path, music_path, ducking=True, duck_level=10):
     try:
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
-        # Align durations
         if len(music) < len(voice):
-            music = music * (len(voice) // len(music) + 1)
-        music = music[:len(voice)]
-        # Apply ducking
         if ducking:
-            music = music - duck_level
-        mixed = music.overlay(voice)
-        output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav")
         mixed.export(output_path, format="wav")
         return output_path
     except Exception as e:
@@ -172,84 +206,132 @@ def blend_audio(voice_path, music_path, ducking=True, duck_level=10):
 # -------------------------------
 # Gradio Interface
 # -------------------------------
-with gr.Blocks(title="AI Radio Studio", css=".gradio-container {max-width: 800px !important}") as demo:
     gr.Markdown("""
-    # 🎙️ AI Radio Studio
-    *Create professional audio content in 4 easy steps*
     """)
     with gr.Tabs():
-        with gr.Tab("1️⃣ Concept"):
-            concept_input = gr.Textbox(label="Your Idea", placeholder="Describe your radio promo...", lines=3)
             with gr.Row():
-                model_select = gr.Dropdown(
-                    choices=list(MODEL_CONFIG["llama_models"].values()),
-                    label="AI Model",
-                    value="meta-llama/Meta-Llama-3-8B-Instruct"
-                )
-                duration_select = gr.Slider(15, 60, 30, step=15, label="Duration (sec)")
-            generate_btn = gr.Button("Generate Script", variant="primary")
-            script_output = gr.Textbox(label="Voice Script", interactive=True)
-            sound_output = gr.Textbox(label="Sound Effects", interactive=True)
-            music_output = gr.Textbox(label="Music Style", interactive=True)
-        with gr.Tab("2️⃣ Voice"):
-            tts_select = gr.Dropdown(
-                choices=list(MODEL_CONFIG["tts_models"].values()),
-                label="Voice Model",
-                value="tts_models/en/ljspeech/tacotron2-DDC"
             )
-            voice_btn = gr.Button("Generate Voiceover", variant="primary")
-            voice_preview = gr.Audio(label="Preview", type="filepath")
-        with gr.Tab("3️⃣ Music"):
-            music_btn = gr.Button("Generate Music", variant="primary")
-            music_preview = gr.Audio(label="Preview", type="filepath")
-        with gr.Tab("4️⃣ Mix"):
             with gr.Row():
-                ducking_toggle = gr.Checkbox(True, label="Duck Music")
-                duck_level = gr.Slider(0, 20, 10, label="Duck Level (dB)")
-            mix_btn = gr.Button("Create Final Mix", variant="primary")
-            final_mix = gr.Audio(label="Final Output", type="filepath")
-    # Footer Section
-    gr.Markdown("""
-    <div style="text-align: center; margin-top: 20px; padding: 15px; border-top: 1px solid #e0e0e0;">
-        <p style="font-size: 0.9em; color: #666;">
-            Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
-        </p>
-        <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/radiogold">
-            <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759"/>
-        </a>
-    </div>
-    """)
-    # Event Handlers
-    generate_btn.click(
-        generate_script,
-        inputs=[concept_input, model_select, duration_select],
-        outputs=[script_output, sound_output, music_output]
-    )
-    voice_btn.click(
-        generate_voice,
-        inputs=[script_output, tts_select],
-        outputs=voice_preview
-    )
-    music_btn.click(
-        generate_music,
-        inputs=[music_output],
-        outputs=music_preview
-    )
-    mix_btn.click(
-        blend_audio,
-        inputs=[voice_preview, music_preview, ducking_toggle, duck_level],
-        outputs=final_mix
-    )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

     "llama_models": {
         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
+        "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
     },
     "tts_models": {
         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
         "High Quality": "tts_models/en/ljspeech/vits",
+        "Fast Inference": "tts_models/en/sam/tacotron-DDC"
     }
 }
     def get_llama_pipeline(self, model_id, token):
         if model_id not in self.llama_pipelines:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
             model = AutoModelForCausalLM.from_pretrained(
                 model_id,
+                token=token,
                 torch_dtype=torch.float16,
+                device_map="auto",
+                attn_implementation="flash_attention_2"
             )
             self.llama_pipelines[model_id] = pipeline(
                 "text-generation",
                 model=model,
+                tokenizer=tokenizer,
+                device_map="auto"
             )
         return self.llama_pipelines[model_id]
         if model_key not in self.musicgen_models:
             model = MusicgenForConditionalGeneration.from_pretrained(model_key)
             processor = AutoProcessor.from_pretrained(model_key)
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            model.to(device)
             self.musicgen_models[model_key] = (model, processor)
         return self.musicgen_models[model_key]
 # -------------------------------
 # Core Functions
 # -------------------------------
+@spaces.GPU(duration=120)
+def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens=512):
     try:
         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
+        system_prompt = f"""You are an AI audio production assistant. Create content for a {duration}-second promo:
+1. Voice Script: [Clear, engaging narration]
+2. Sound Design: [3-5 specific sound effects]
+3. Music: [Genre, tempo, mood suggestions]
+Keep sections concise and production-ready."""
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+        response = text_pipeline(
+            messages,
+            max_new_tokens=max_tokens,
             temperature=temperature,
+            do_sample=True,
+            top_p=0.95,
+            eos_token_id=text_pipeline.tokenizer.eos_token_id
         )
+        return parse_generated_content(response[0]['generated_text'][-1]['content'])
     except Exception as e:
         return f"Error: {str(e)}", "", ""
     return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip()
+@spaces.GPU(duration=100)
 def generate_voice(script, tts_model, speed=1.0):
     try:
         if not script.strip():
+            raise ValueError("Empty script")
         tts = model_manager.get_tts_model(tts_model)
+        output_path = os.path.join(tempfile.gettempdir(), "enhanced_voice.wav")
+        tts.tts_to_file(
+            text=script,
+            file_path=output_path,
+            speed=speed
+        )
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
+@spaces.GPU(duration=150)
+def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0):
     try:
         model, processor = model_manager.get_musicgen_model()
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        inputs = processor(
+            text=[prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+        audio_values = model.generate(
+            **inputs,
+            max_new_tokens=int(duration_sec * 50),
+            temperature=temperature,
+            guidance_scale=guidance_scale,
+            do_sample=True
+        )
+        output_path = os.path.join(tempfile.gettempdir(), "enhanced_music.wav")
+        write(output_path, 32000, audio_values[0, 0].cpu().numpy())
         return output_path
     except Exception as e:
         return f"Error: {str(e)}"
+def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=500):
     try:
         voice = AudioSegment.from_wav(voice_path)
         music = AudioSegment.from_wav(music_path)
         if len(music) < len(voice):
+            loops = (len(voice) // len(music)) + 1
+            music = music * loops
+        music = music[:len(voice)].fade_out(crossfade)
         if ducking:
+            ducked_music = music - duck_level
+            mixed = ducked_music.overlay(voice.fade_in(crossfade))
+        else:
+            mixed = music.overlay(voice)
+        output_path = os.path.join(tempfile.gettempdir(), "enhanced_mix.wav")
         mixed.export(output_path, format="wav")
         return output_path
     except Exception as e:
 # -------------------------------
 # Gradio Interface
 # -------------------------------
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="teal",
+).set(
+    body_text_color_dark='#FFFFFF',
+    background_fill_primary_dark='#1F1F1F'
+)
+with gr.Blocks(theme=theme, title="AI Audio Studio Pro") as demo:
     gr.Markdown("""
+    # 🎙️ AI Audio Studio Pro
+    *Next-generation audio production powered by AI*
     """)
     with gr.Tabs():
+        with gr.Tab("🎯 Concept Development"):
             with gr.Row():
+                with gr.Column(scale=2):
+                    concept_input = gr.Textbox(
+                        label="Your Concept",
+                        placeholder="Describe your audio project...",
+                        lines=3,
+                        max_lines=6
+                    )
+                    with gr.Accordion("Advanced Settings", open=False):
+                        with gr.Row():
+                            model_selector = gr.Dropdown(
+                                choices=list(MODEL_CONFIG["llama_models"].values()),
+                                label="AI Model",
+                                value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
+                            )
+                            duration_slider = gr.Slider(15, 120, value=30, step=15, label="Duration (seconds)")
+                        with gr.Row():
+                            temp_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
+                            token_slider = gr.Slider(128, 1024, value=512, step=128, label="Max Length")
+                    generate_btn = gr.Button("✨ Generate Concept", variant="primary")
+                with gr.Column(scale=1):
+                    script_output = gr.Textbox(label="Voice Script", interactive=True)
+                    sound_output = gr.Textbox(label="Sound Design", interactive=True)
+                    music_output = gr.Textbox(label="Music Suggestions", interactive=True)
+            generate_btn.click(
+                generate_script,
+                inputs=[concept_input, model_selector, duration_slider, temp_slider, token_slider],
+                outputs=[script_output, sound_output, music_output]
             )
+        with gr.Tab("🗣️ Voice Production"):
             with gr.Row():
+                with gr.Column():
+                    tts_model = gr.Dropdown(
+                        choices=list(MODEL_CONFIG["tts_models"].values()),
+                        label="Voice Model",
+                        value=MODEL_CONFIG["tts_models"]["Standard English"]
+                    )
+                    speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")
+                    voice_btn = gr.Button("🎙️ Generate Voiceover", variant="primary")
+                with gr.Column():
+                    voice_preview = gr.Audio(label="Preview", interactive=False)
+                    voice_btn.click(
+                        generate_voice,
+                        inputs=[script_output, tts_model, speed_slider],
+                        outputs=voice_preview
+                    )
+        with gr.Tab("🎶 Music Production"):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Accordion("Music Parameters", open=True):
+                        music_duration = gr.Slider(10, 120, value=30, label="Duration (seconds)")
+                        music_temp = gr.Slider(0.1, 2.0, value=1.0, label="Creativity")
+                        guidance_scale = gr.Slider(1.0, 5.0, value=3.0, label="Focus")
+                    music_btn = gr.Button("🎵 Generate Music", variant="primary")
+                with gr.Column():
+                    music_preview = gr.Audio(label="Preview", interactive=False)
+                    music_btn.click(
+                        generate_music,
+                        inputs=[music_output, music_duration, music_temp, guidance_scale],
+                        outputs=music_preview
+                    )
+        with gr.Tab("🔊 Final Mix"):
+            with gr.Row():
+                with gr.Column():
+                    ducking_toggle = gr.Checkbox(value=True, label="Enable Voice Ducking")
+                    duck_level = gr.Slider(0, 30, value=12, label="Ducking Strength (dB)")
+                    crossfade_time = gr.Slider(0, 2000, value=500, label="Crossfade (ms)")
+                    mix_btn = gr.Button("🚀 Create Final Mix", variant="primary")
+                with gr.Column():
+                    final_mix = gr.Audio(label="Master Output", interactive=False)
+                    mix_btn.click(
+                        blend_audio,
+                        inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
+                        outputs=final_mix
+                    )
+    with gr.Accordion("📚 Example Prompts", open=False):
+        gr.Examples(
+            examples=[
+                ["A 30-second tech podcast intro with futuristic sounds"],
+                ["A 15-second radio ad for a coffee shop with morning vibes"],
+                ["A 60-second documentary trailer with epic orchestral music"]
+            ],
+            inputs=concept_input
+        )
+    with gr.Row():
+        gr.Markdown("### System Resources")
+        gpu_status = gr.Textbox(label="GPU Utilization", interactive=False)
+        ram_status = gr.Textbox(label="RAM Usage", interactive=False)
+    # Custom Footer
+    gr.Markdown("""
+    <hr>
+    <p style="text-align: center; font-size: 0.9em;">
+        Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
+    </p>
+    """)
+    gr.HTML("""
+    <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
+        <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" />
+    </a>
+    """)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)