Spaces:

Bils
/

AIPromoStudio

Running on Zero

App Files Community

Bils committed on Feb 28

Commit

9a49723

verified ·

1 Parent(s): d45e0a7

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -73

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import re
 import torch
 import tempfile
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 from dotenv import load_dotenv
@@ -20,8 +21,9 @@ from transformers import (
 from TTS.api import TTS
 # ---------------------------------------------------------------------
-# Load Environment Variables
 # ---------------------------------------------------------------------
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -39,15 +41,14 @@ def clean_text(text: str) -> str:
     """
     Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
     """
-    # Remove all asterisks. You can add more cleaning steps here as needed.
     return re.sub(r'\*', '', text)
 # ---------------------------------------------------------------------
-# Helper Functions
 # ---------------------------------------------------------------------
 def get_llama_pipeline(model_id: str, token: str):
     """
-    Returns a cached LLaMA pipeline if available; otherwise, loads it.
     """
     if model_id in LLAMA_PIPELINES:
         return LLAMA_PIPELINES[model_id]
@@ -67,7 +68,7 @@ def get_llama_pipeline(model_id: str, token: str):
 def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     """
-    Returns a cached MusicGen model if available; otherwise, loads it.
     Uses the 'large' variant for higher quality outputs.
     """
     if model_key in MUSICGEN_MODELS:
@@ -75,7 +76,6 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     model = MusicgenForConditionalGeneration.from_pretrained(model_key)
     processor = AutoProcessor.from_pretrained(model_key)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model.to(device)
     MUSICGEN_MODELS[model_key] = (model, processor)
@@ -84,7 +84,7 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
 def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
-    Returns a cached TTS model if available; otherwise, loads it.
     """
     if model_name in TTS_MODELS:
         return TTS_MODELS[model_name]
@@ -100,18 +100,18 @@ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
 @spaces.GPU(duration=100)
 def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
     """
-    Generates a script, sound design suggestions, and music ideas from a user prompt.
-    Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
     """
     try:
         text_pipeline = get_llama_pipeline(model_id, token)
         system_prompt = (
             "You are an expert radio imaging producer specializing in sound design and music. "
-            f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
-            "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
-            "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
-            "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
         )
         combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
@@ -127,37 +127,20 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
         if "Output:" in generated_text:
             generated_text = generated_text.split("Output:")[-1].strip()
-        # Default placeholders
-        voice_script = "No voice-over script found."
-        sound_design = "No sound design suggestions found."
-        music_suggestions = "No music suggestions found."
-        # Voice-Over Script
-        if "Voice-Over Script:" in generated_text:
-            parts = generated_text.split("Voice-Over Script:")
-            voice_script_part = parts[1]
-            if "Sound Design Suggestions:" in voice_script_part:
-                voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
-            else:
-                voice_script = voice_script_part.strip()
-        # Sound Design
-        if "Sound Design Suggestions:" in generated_text:
-            parts = generated_text.split("Sound Design Suggestions:")
-            sound_design_part = parts[1]
-            if "Music Suggestions:" in sound_design_part:
-                sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
-            else:
-                sound_design = sound_design_part.strip()
-        # Music Suggestions
-        if "Music Suggestions:" in generated_text:
-            parts = generated_text.split("Music Suggestions:")
-            music_suggestions = parts[1].strip()
         return voice_script, sound_design, music_suggestions
     except Exception as e:
         return f"Error generating script: {e}", "", ""
@@ -167,24 +150,22 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
 @spaces.GPU(duration=100)
 def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
-    Generates a voice-over from the provided script using the Coqui TTS model.
     Returns the file path to the generated .wav file.
     """
     try:
         if not script.strip():
             return "Error: No script provided."
-        # Clean the script to remove special characters (e.g., asterisks) that may produce warnings
         cleaned_script = clean_text(script)
         tts_model = get_tts_model(tts_model_name)
-        # Generate and save voice
         output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
         tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
         return output_path
     except Exception as e:
         return f"Error generating voice: {e}"
@@ -194,7 +175,7 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
 @spaces.GPU(duration=200)
 def generate_music(prompt: str, audio_length: int):
     """
-    Generates music from the 'facebook/musicgen-large' model based on the prompt.
     Returns the file path to the generated .wav file.
     """
     try:
@@ -203,10 +184,9 @@ def generate_music(prompt: str, audio_length: int):
         model_key = "facebook/musicgen-large"
         musicgen_model, musicgen_processor = get_musicgen_model(model_key)
         device = "cuda" if torch.cuda.is_available() else "cpu"
-        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
         with torch.inference_mode():
             outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
@@ -219,6 +199,7 @@ def generate_music(prompt: str, audio_length: int):
         return output_path
     except Exception as e:
         return f"Error generating music: {e}"
@@ -229,9 +210,9 @@ def generate_music(prompt: str, audio_length: int):
 def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
     """
     Blends two audio files (voice and music).
-    1. If music < voice, loops the music until it meets/exceeds the voice duration.
-    2. If music > voice, trims music to the voice duration.
-    3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
     Returns the file path to the blended .wav file.
     """
     try:
@@ -242,18 +223,16 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
         music = AudioSegment.from_wav(music_path)
         voice_len = len(voice)  # in milliseconds
-        music_len = len(music)  # in milliseconds
-        # Loop music if it's shorter than the voice
-        if music_len < voice_len:
             looped_music = AudioSegment.empty()
             while len(looped_music) < voice_len:
                 looped_music += music
             music = looped_music
-        # Trim music if it's longer than the voice
-        if len(music) > voice_len:
-            music = music[:voice_len]
         if ducking:
             ducked_music = music - duck_level
@@ -266,11 +245,12 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
         return output_path
     except Exception as e:
         return f"Error blending audio: {e}"
 # ---------------------------------------------------------------------
-# Gradio Interface with Enhanced UI
 # ---------------------------------------------------------------------
 with gr.Blocks(css="""
     /* Global Styles */
@@ -314,26 +294,26 @@ with gr.Blocks(css="""
     # Custom Header
     with gr.Row(elem_classes="header"):
         gr.Markdown("""
-        <h1>🎧 AI Promo Studio</h1>
-        <p>Your all-in-one AI solution for crafting engaging audio promos.</p>
         """)
     gr.Markdown("""
-    Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate:
-    - **Script**: Generate a compelling voice-over script with LLaMA.
-    - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS.
-    - **Music Production**: Produce custom music tracks with MusicGen.
-    - **Audio Blending**: Seamlessly blend voice and music with options for ducking.
     """)
     with gr.Tabs():
-        # Step 1: Generate Script
         with gr.Tab("📝 Script Generation"):
             with gr.Row():
                 user_prompt = gr.Textbox(
                     label="Promo Idea",
-                    placeholder="E.g., A 30-second promo for a morning show...",
                     lines=2
                 )
             with gr.Row():
@@ -343,7 +323,7 @@ with gr.Blocks(css="""
                     placeholder="Enter a valid Hugging Face model ID"
                 )
                 duration = gr.Slider(
-                    label="Desired Promo Duration (seconds)",
                     minimum=15,
                     maximum=60,
                     step=15,
@@ -355,12 +335,12 @@ with gr.Blocks(css="""
             music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
             generate_script_button.click(
-                fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
                 inputs=[user_prompt, llama_model_id, duration],
                 outputs=[script_output, sound_design_output, music_suggestion_output],
             )
-        # Step 2: Generate Voice
         with gr.Tab("🎤 Voice Synthesis"):
             gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
             selected_tts_model = gr.Dropdown(
@@ -382,7 +362,7 @@ with gr.Blocks(css="""
                 outputs=voice_audio_output,
             )
-        # Step 3: Generate Music
         with gr.Tab("🎶 Music Production"):
             gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
             audio_length = gr.Slider(
@@ -397,12 +377,12 @@ with gr.Blocks(css="""
             music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
             generate_music_button.click(
-                fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
                 inputs=[music_suggestion_output, audio_length],
                 outputs=[music_output],
             )
-        # Step 4: Blend Audio
         with gr.Tab("🎚️ Audio Blending"):
             gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
@@ -428,7 +408,7 @@ with gr.Blocks(css="""
         <hr>
         Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
         <br>
-        <small>AI Promo Studio &copy; 2025</small>
     </div>
     """)

 import re
 import torch
 import tempfile
+import logging
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 from dotenv import load_dotenv
 from TTS.api import TTS
 # ---------------------------------------------------------------------
+# Setup Logging and Environment Variables
 # ---------------------------------------------------------------------
+logging.basicConfig(level=logging.INFO)
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
     """
     Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
     """
     return re.sub(r'\*', '', text)
 # ---------------------------------------------------------------------
+# Model Helper Functions
 # ---------------------------------------------------------------------
 def get_llama_pipeline(model_id: str, token: str):
     """
+    Returns a cached LLaMA text-generation pipeline if available; otherwise, loads and caches it.
     """
     if model_id in LLAMA_PIPELINES:
         return LLAMA_PIPELINES[model_id]
 def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
     """
+    Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
     Uses the 'large' variant for higher quality outputs.
     """
     if model_key in MUSICGEN_MODELS:
     model = MusicgenForConditionalGeneration.from_pretrained(model_key)
     processor = AutoProcessor.from_pretrained(model_key)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model.to(device)
     MUSICGEN_MODELS[model_key] = (model, processor)
 def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
+    Returns a cached TTS model if available; otherwise, loads and caches it.
     """
     if model_name in TTS_MODELS:
         return TTS_MODELS[model_name]
 @spaces.GPU(duration=100)
 def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
     """
+    Generates a voice-over script, sound design suggestions, and music ideas from a user prompt.
+    Returns a tuple: (voice_script, sound_design, music_suggestions).
     """
     try:
         text_pipeline = get_llama_pipeline(model_id, token)
         system_prompt = (
             "You are an expert radio imaging producer specializing in sound design and music. "
+            f"Based on the user's concept and the selected duration of {duration} seconds, produce the following:\n"
+            "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'\n"
+            "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'\n"
+            "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'"
         )
         combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
         if "Output:" in generated_text:
             generated_text = generated_text.split("Output:")[-1].strip()
+        # Try to extract sections using regex; fall back to defaults if not found.
+        pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
+        match = re.search(pattern, generated_text, re.DOTALL)
+        if match:
+            voice_script, sound_design, music_suggestions = (grp.strip() for grp in match.groups())
+        else:
+            voice_script = "No voice-over script found."
+            sound_design = "No sound design suggestions found."
+            music_suggestions = "No music suggestions found."
         return voice_script, sound_design, music_suggestions
     except Exception as e:
+        logging.exception("Error generating script")
         return f"Error generating script: {e}", "", ""
 @spaces.GPU(duration=100)
 def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
     """
+    Generates a voice-over audio file from the provided script using Coqui TTS.
     Returns the file path to the generated .wav file.
     """
     try:
         if not script.strip():
             return "Error: No script provided."
         cleaned_script = clean_text(script)
         tts_model = get_tts_model(tts_model_name)
         output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
         tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
         return output_path
     except Exception as e:
+        logging.exception("Error generating voice")
         return f"Error generating voice: {e}"
 @spaces.GPU(duration=200)
 def generate_music(prompt: str, audio_length: int):
     """
+    Generates a music track from the 'facebook/musicgen-large' model based on the prompt.
     Returns the file path to the generated .wav file.
     """
     try:
         model_key = "facebook/musicgen-large"
         musicgen_model, musicgen_processor = get_musicgen_model(model_key)
         device = "cuda" if torch.cuda.is_available() else "cpu"
+        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
         with torch.inference_mode():
             outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
         return output_path
     except Exception as e:
+        logging.exception("Error generating music")
         return f"Error generating music: {e}"
 def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
     """
     Blends two audio files (voice and music).
+      - Loops music if shorter than voice.
+      - Trims music if longer than voice.
+      - Applies ducking to lower music volume during voice segments if enabled.
     Returns the file path to the blended .wav file.
     """
     try:
         music = AudioSegment.from_wav(music_path)
         voice_len = len(voice)  # in milliseconds
+        # Loop music if it's shorter than voice
+        if len(music) < voice_len:
             looped_music = AudioSegment.empty()
             while len(looped_music) < voice_len:
                 looped_music += music
             music = looped_music
+        # Trim music to match voice duration
+        music = music[:voice_len]
         if ducking:
             ducked_music = music - duck_level
         return output_path
     except Exception as e:
+        logging.exception("Error blending audio")
         return f"Error blending audio: {e}"
 # ---------------------------------------------------------------------
+# Gradio Interface with Enhanced UI for Ai Ads Promo
 # ---------------------------------------------------------------------
 with gr.Blocks(css="""
     /* Global Styles */
     # Custom Header
     with gr.Row(elem_classes="header"):
         gr.Markdown("""
+        <h1>🎧 Ai Ads Promo</h1>
+        <p>Your all-in-one AI solution for crafting engaging audio ads.</p>
         """)
     gr.Markdown("""
+    Welcome to **Ai Ads Promo**! This platform leverages state-of-the-art AI models to help you generate:
+    - **Script**: Create a compelling voice-over script using LLaMA.
+    - **Voice Synthesis**: Produce natural-sounding voice-overs with Coqui TTS.
+    - **Music Production**: Generate custom music tracks with MusicGen.
+    - **Audio Blending**: Seamlessly blend voice and music with optional ducking.
     """)
     with gr.Tabs():
+        # Step 1: Script Generation
         with gr.Tab("📝 Script Generation"):
             with gr.Row():
                 user_prompt = gr.Textbox(
                     label="Promo Idea",
+                    placeholder="E.g., A 30-second ad for a morning show...",
                     lines=2
                 )
             with gr.Row():
                     placeholder="Enter a valid Hugging Face model ID"
                 )
                 duration = gr.Slider(
+                    label="Desired Ad Duration (seconds)",
                     minimum=15,
                     maximum=60,
                     step=15,
             music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
             generate_script_button.click(
+                fn=lambda prompt, model_id, dur: generate_script(prompt, model_id, HF_TOKEN, dur),
                 inputs=[user_prompt, llama_model_id, duration],
                 outputs=[script_output, sound_design_output, music_suggestion_output],
             )
+        # Step 2: Voice Synthesis
         with gr.Tab("🎤 Voice Synthesis"):
             gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
             selected_tts_model = gr.Dropdown(
                 outputs=voice_audio_output,
             )
+        # Step 3: Music Production
         with gr.Tab("🎶 Music Production"):
             gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
             audio_length = gr.Slider(
             music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
             generate_music_button.click(
+                fn=lambda music_prompt, length: generate_music(music_prompt, length),
                 inputs=[music_suggestion_output, audio_length],
                 outputs=[music_output],
             )
+        # Step 4: Audio Blending
         with gr.Tab("🎚️ Audio Blending"):
             gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
         <hr>
         Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
         <br>
+        <small>Ai Ads Promo &copy; 2025</small>
     </div>
     """)