Spaces:

Bils
/

AIPromoStudio

Running on Zero

File size: 16,921 Bytes

a6afe59
a765116
d448add
d3df06a
 
 
 
 
 
 
 
db46bfb
1c1b50f
 
db46bfb
1c1b50f
db8ba25
db46bfb
a8a7982
019c404
3168a3e
a8a7982
 
 
cf3593c
3e34a93
5607a62
a8a7982
 
 
3e34a93
 
 
 
a765116
 
 
 
 
 
 
653eb14
a765116
 
a8a7982
 
 
3e34a93
a8a7982
 
 
3e34a93
 
cc173f9
3e34a93
 
 
 
 
 
 
 
 
 
 
 
cc173f9
3e34a93
a8a7982
 
 
 
3e34a93
 
cc173f9
3e34a93
 
653eb14
3e34a93
 
 
 
 
cc173f9
3e34a93
a8a7982
 
 
3e34a93
 
cc173f9
3e34a93
 
 
 
cc173f9
a8a7982
 
 
3e34a93
 
f2c044d
a8a7982
 
f2c044d
dfa5d3e
3e34a93
cc173f9
f2c044d
a8a7982
 
 
 
a765116
f2c044d
a8a7982
cc173f9
3e34a93
 
a8a7982
 
3e34a93
a8a7982
3e34a93
cc173f9
a8a7982
 
 
cc173f9
a8a7982
 
 
 
cc173f9
 
a8a7982
 
 
 
 
 
 
cc173f9
 
a8a7982
 
 
 
 
 
 
cc173f9
 
a8a7982
 
 
cc173f9
a8a7982
cc173f9
b950350
a8a7982
0105281
bcd1e5d
a8a7982
 
 
3e34a93
a8a7982
f2c044d
a8a7982
 
f2c044d
b950350
559ca26
a8a7982
cc173f9
 
a765116
cc173f9
3e34a93
cc173f9
 
a8a7982
a765116
3e34a93
cc173f9
f2c044d
a8a7982
f2c044d
cc173f9
a8a7982
 
 
89daa1e
3e34a93
f2c044d
a8a7982
 
f2c044d
17d10a7
a8a7982
 
cc173f9
a8a7982
 
cc173f9
3e34a93
a6afe59
cc173f9
3e34a93
a8a7982
cc173f9
3e34a93
a8a7982
cc173f9
d3df06a
3e34a93
cc173f9
3e34a93
cc173f9
cf3593c
a8a7982
 
cc173f9
a8a7982
 
 
3e34a93
a8a7982
f2c044d
a8a7982
cc173f9
 
 
a8a7982
f2c044d
ecc69bf
a8a7982
 
cc173f9
559ca26
 
cc173f9
 
 
 
 
a8a7982
 
 
 
 
cc173f9
 
a8a7982
 
cc173f9
559ca26
3e34a93
 
 
 
cc173f9
a8a7982
3e34a93
 
cc173f9
d9bf0f0
a8a7982
 
cc173f9
a8a7982
d3df06a
a8a7982
d3df06a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc173f9
d3df06a
 
 
 
 
 
 
 
cc173f9
 
 
 
d3df06a
 
 
 
 
 
 
 
 
 
 
 
 
a6afe59
 
d3df06a
a8a7982
d3df06a
a6afe59
cc173f9
 
 
 
 
d3df06a
a8a7982
 
cc173f9
d3df06a
a8a7982
 
 
cc173f9
a8a7982
 
d3df06a
a8a7982
 
 
 
 
 
 
 
 
 
 
 
cc173f9
 
a8a7982
 
cc173f9
a8a7982
 
 
cc173f9
a8a7982
 
cc173f9
d3df06a
cc173f9
a8a7982
 
 
 
 
 
 
 
 
 
cc173f9
a8a7982
cc173f9
a8a7982
 
cc173f9
 
a8a7982
 
cc173f9
d3df06a
cc173f9
a8a7982
 
 
 
 
 
d3df06a
a8a7982
cc173f9
a8a7982
cc173f9
a8a7982
 
 
cc173f9
a8a7982
 
cc173f9
d3df06a
cc173f9
a8a7982
 
 
 
 
 
 
 
cc173f9
a8a7982
cc173f9
a8a7982
 
 
 
 
3fe530b
a8a7982
 
d3df06a
 
 
 
a6afe59
d3df06a
a8a7982
 
cc173f9
a8a7982
d3df06a
 
 
 
 
a8a7982

import os
import re
import tempfile

import gradio as gr
import spaces
import torch
from dotenv import load_dotenv
from pydub import AudioSegment
from scipy.io.wavfile import write

# Transformers & Models
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    MusicgenForConditionalGeneration,
    pipeline,
)

# Coqui TTS
from TTS.api import TTS

# ---------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------
# Load variables via python-dotenv first, then read the Hugging Face access
# token from the process environment. HF_TOKEN is None when it is not set.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# ---------------------------------------------------------------------
# Process-wide Model Caches
# ---------------------------------------------------------------------
# Each cache maps a model identifier string to the already-loaded object(s)
# so repeated requests do not reload the same model.
LLAMA_PIPELINES: dict = {}  # model_id   -> text-generation pipeline
MUSICGEN_MODELS: dict = {}  # model_key  -> (model, processor)
TTS_MODELS: dict = {}       # model_name -> TTS instance

# ---------------------------------------------------------------------
# Utility Function: Clean Text
# ---------------------------------------------------------------------
def clean_text(text: str) -> str:
    """
    Strips characters that the speech model's vocabulary may not recognize.

    At present only asterisks ('*') are dropped; all other characters are
    returned unchanged and in their original order.
    """
    # A plain literal replacement is sufficient; chain further replacements here if needed.
    return text.replace("*", "")

# ---------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------
def get_llama_pipeline(model_id: str, token: str):
    """
    Returns a cached LLaMA text-generation pipeline if available; otherwise, loads it.

    Args:
        model_id: Hugging Face model repository ID. Also used as the cache key.
        token: Hugging Face access token forwarded to `from_pretrained`
            (may be None, e.g. when HF_TOKEN is not set).

    Returns:
        The `transformers` "text-generation" pipeline built for `model_id`.
    """
    if model_id in LLAMA_PIPELINES:
        return LLAMA_PIPELINES[model_id]

    # `token=` replaces the deprecated `use_auth_token=` keyword, which newer
    # transformers releases no longer accept.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        torch_dtype=torch.float16,
        device_map="auto",
        # NOTE(review): trust_remote_code=True lets the model repository run its
        # own Python code, and model_id is typed by the user in the UI. Kept
        # as-is to preserve behavior; consider restricting the allowed model IDs.
        trust_remote_code=True,
    )
    text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
    LLAMA_PIPELINES[model_id] = text_pipeline
    return text_pipeline


def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
    """
    Fetches the (model, processor) pair for a MusicGen checkpoint.

    The pair is loaded on first use, moved to CUDA when available (CPU
    otherwise), and kept in MUSICGEN_MODELS so later calls with the same
    `model_key` reuse it. The default key is the 'large' variant.
    """
    try:
        # Fast path: this checkpoint was already loaded.
        return MUSICGEN_MODELS[model_key]
    except KeyError:
        pass

    model = MusicgenForConditionalGeneration.from_pretrained(model_key)
    processor = AutoProcessor.from_pretrained(model_key)
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    loaded_pair = (model, processor)
    MUSICGEN_MODELS[model_key] = loaded_pair
    return loaded_pair


def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Fetches the Coqui TTS instance for `model_name`, creating it on first use.

    Instances are kept in TTS_MODELS so each model name is only loaded once.
    """
    if model_name not in TTS_MODELS:
        # First request for this model: construct it and remember it.
        TTS_MODELS[model_name] = TTS(model_name)
    return TTS_MODELS[model_name]


# ---------------------------------------------------------------------
# Script Generation Function
# ---------------------------------------------------------------------
def _extract_section(text: str, header: str, next_header: str, default: str) -> str:
    """Returns the stripped text between `header` and `next_header`, or `default` if `header` is absent."""
    if header not in text:
        return default
    section = text.split(header)[1]
    if next_header:
        # Cut the section where the following section's header begins (if present).
        section = section.split(next_header)[0]
    return section.strip()


@spaces.GPU(duration=100)
def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
    """
    Generates a script, sound design suggestions, and music ideas from a user prompt.

    Args:
        user_prompt: The promo concept entered by the user.
        model_id: Hugging Face model ID passed to `get_llama_pipeline`.
        token: Hugging Face access token passed to `get_llama_pipeline`.
        duration: Desired promo length in seconds; interpolated into the prompt.

    Returns:
        A tuple of strings: (voice_script, sound_design, music_suggestions).
        A section the model did not produce is replaced by a "No ... found."
        placeholder. On any exception the tuple is
        ("Error generating script: <message>", "", "").
    """
    try:
        text_pipeline = get_llama_pipeline(model_id, token)

        system_prompt = (
            "You are an expert radio imaging producer specializing in sound design and music. "
            f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
            "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
            "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
            "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
        )
        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"

        with torch.inference_mode():
            result = text_pipeline(
                combined_prompt,
                max_new_tokens=300,
                do_sample=True,
                temperature=0.8
            )

        generated_text = result[0]["generated_text"]
        if generated_text.startswith(combined_prompt):
            # Drop exactly the echoed prompt. Splitting on the last "Output:" would
            # also discard real content whenever the model writes "Output:" itself.
            generated_text = generated_text[len(combined_prompt):].strip()
        elif "Output:" in generated_text:
            # Fallback for when the prompt is not echoed back verbatim.
            generated_text = generated_text.split("Output:")[-1].strip()

        voice_script = _extract_section(
            generated_text,
            "Voice-Over Script:",
            "Sound Design Suggestions:",
            "No voice-over script found.",
        )
        sound_design = _extract_section(
            generated_text,
            "Sound Design Suggestions:",
            "Music Suggestions:",
            "No sound design suggestions found.",
        )
        # The music section is the last one, so it has no terminating header.
        music_suggestions = _extract_section(
            generated_text,
            "Music Suggestions:",
            "",
            "No music suggestions found.",
        )

        return voice_script, sound_design, music_suggestions

    except Exception as e:
        # UI boundary: report the failure in the first output box instead of raising.
        return f"Error generating script: {e}", "", ""


# ---------------------------------------------------------------------
# Voice-Over Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
    """
    Synthesizes a voice-over for `script` with the selected Coqui TTS model.

    Returns the path of the written .wav file ("voice_over.wav" in the system
    temp directory). If the script is blank, or any exception occurs, an
    "Error ..." string is returned instead of a path.
    """
    try:
        if script.strip():
            # Drop characters (e.g., asterisks) that may make the TTS model emit warnings
            speakable_text = clean_text(script)

            synthesizer = get_tts_model(tts_model_name)

            # Synthesize straight into the temp-dir file and hand back its path
            wav_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
            synthesizer.tts_to_file(text=speakable_text, file_path=wav_path)
            return wav_path

        return "Error: No script provided."

    except Exception as e:
        return f"Error generating voice: {e}"


# ---------------------------------------------------------------------
# Music Generation Function
# ---------------------------------------------------------------------
@spaces.GPU(duration=200)
def generate_music(prompt: str, audio_length: int):
    """
    Generates music from the 'facebook/musicgen-large' model based on the prompt.

    Args:
        prompt: Text description of the desired music.
        audio_length: Number of new tokens to generate (passed as `max_new_tokens`).

    Returns:
        The file path to the generated .wav file (16-bit PCM, peak-normalized),
        or an "Error ..." string if the prompt is blank or generation fails.
    """
    try:
        if not prompt.strip():
            return "Error: No music suggestion provided."

        model_key = "facebook/musicgen-large"
        musicgen_model, musicgen_processor = get_musicgen_model(model_key)

        # Put the inputs on whatever device the cached model actually lives on.
        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(musicgen_model.device)

        with torch.inference_mode():
            # UI sliders may deliver floats; generate() expects an integer token count.
            outputs = musicgen_model.generate(**inputs, max_new_tokens=int(audio_length))

        # First batch item, first channel.
        audio_data = outputs[0, 0].cpu().numpy()

        # Peak-normalize to the int16 range. Skip the division for silent or empty
        # output, where dividing by a zero peak would yield NaNs.
        peak = float(abs(audio_data).max()) if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak
        normalized_audio = (audio_data * 32767).astype("int16")

        # Write at the model's native sampling rate. A hard-coded 44100 Hz makes
        # the 32 kHz MusicGen output play back too fast and too high.
        sampling_rate = musicgen_model.config.audio_encoder.sampling_rate

        output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
        write(output_path, sampling_rate, normalized_audio)

        return output_path

    except Exception as e:
        return f"Error generating music: {e}"


# ---------------------------------------------------------------------
# Audio Blending with Duration Sync & Ducking
# ---------------------------------------------------------------------
@spaces.GPU(duration=100)
def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
    """
    Blends two audio files (voice and music).

    1. If music < voice, loops the music until it meets/exceeds the voice duration.
    2. If music > voice, trims music to the voice duration.
    3. If ducking=True, the music is attenuated by 'duck_level' dB. Because the
       music has already been cut to the voice duration, this applies to the
       whole track.

    Args:
        voice_path: Path to the voice-over .wav file.
        music_path: Path to the music .wav file.
        ducking: Whether to attenuate the music under the voice.
        duck_level: Attenuation in dB applied to the music when ducking.

    Returns:
        The file path to the blended .wav file, or an "Error ..." string if an
        input is missing/empty or blending fails.
    """
    # NOTE(review): nothing in this function touches torch/CUDA; the GPU
    # decorator is kept only to leave the existing behavior unchanged.
    try:
        # Empty Audio components deliver None; treat that like a missing file
        # instead of surfacing a raw TypeError message from os.path.isfile.
        if not voice_path or not music_path:
            return "Error: Missing audio files for blending."
        if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
            return "Error: Missing audio files for blending."

        voice = AudioSegment.from_wav(voice_path)
        music = AudioSegment.from_wav(music_path)

        voice_len = len(voice)  # in milliseconds
        music_len = len(music)  # in milliseconds

        # A zero-length music track can never be looped up to the voice length;
        # without this guard the loop below would never terminate.
        if music_len == 0 and voice_len > 0:
            return "Error: Music track is empty; nothing to blend."

        # Loop music if it's shorter than the voice
        if music_len < voice_len:
            looped_music = AudioSegment.empty()
            while len(looped_music) < voice_len:
                looped_music += music
            music = looped_music

        # Trim music if it's longer than the voice
        if len(music) > voice_len:
            music = music[:voice_len]

        # Subtracting a number from an AudioSegment lowers its gain by that many dB.
        backing_track = music - duck_level if ducking else music
        final_audio = backing_track.overlay(voice)

        output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
        final_audio.export(output_path, format="wav")
        return output_path

    except Exception as e:
        return f"Error blending audio: {e}"


# ---------------------------------------------------------------------
# Gradio Interface with Enhanced UI
# ---------------------------------------------------------------------
# Build the four-tab UI. Each tab's outputs feed the next tab's inputs:
# script text -> voice-over, music suggestions -> music, voice + music -> blend.
# The CSS string below styles the page, the header row, and the footer.
with gr.Blocks(css="""
    /* Global Styles */
    body {
        background: linear-gradient(135deg, #1d1f21, #3a3d41);
        color: #f0f0f0;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
    .header {
        text-align: center;
        padding: 2rem 1rem;
        background: linear-gradient(90deg, #6a11cb, #2575fc);
        border-radius: 0 0 20px 20px;
        margin-bottom: 2rem;
    }
    .header h1 {
        margin: 0;
        font-size: 2.5rem;
    }
    .header p {
        font-size: 1.2rem;
    }
    .gradio-container {
        background: #2e2e2e;
        border-radius: 10px;
        padding: 1rem;
    }
    .tab-title {
        font-size: 1.1rem;
        font-weight: bold;
    }
    .footer {
        text-align: center;
        font-size: 0.9em;
        margin-top: 2rem;
        padding: 1rem;
        color: #cccccc;
    }
""") as demo:

    # Custom Header (styled through the ".header" CSS class above)
    with gr.Row(elem_classes="header"):
        gr.Markdown("""
        <h1>🎧 AI Ads Promo</h1>
        <p>Your all-in-one AI solution for crafting engaging audio ads. <br><em>Demo MVP</em></p>
        """)

    # Introductory text listing the four steps offered by the tabs below
    gr.Markdown("""
    Welcome to **AI Ads Promo (Demo MVP)**! This platform leverages state-of-the-art AI models to help you generate:
    
    - **Script**: Generate a compelling voice-over script with LLaMA.
    - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS.
    - **Music Production**: Produce custom music tracks with MusicGen.
    - **Audio Blending**: Seamlessly blend voice and music with options for ducking.
    """)

    with gr.Tabs():
        # Step 1: Generate Script
        with gr.Tab("📝 Script Generation"):
            with gr.Row():
                user_prompt = gr.Textbox(
                    label="Promo Idea", 
                    placeholder="E.g., A 30-second promo for a morning show...",
                    lines=2
                )
            with gr.Row():
                llama_model_id = gr.Textbox(
                    label="LLaMA Model ID", 
                    value="meta-llama/Meta-Llama-3-8B-Instruct", 
                    placeholder="Enter a valid Hugging Face model ID"
                )
                duration = gr.Slider(
                    label="Desired Promo Duration (seconds)",
                    minimum=15, 
                    maximum=60, 
                    step=15, 
                    value=30
                )
            generate_script_button = gr.Button("Generate Script", variant="primary")
            # Read-only boxes; they receive the three strings returned by generate_script.
            script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
            sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
            music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)

            # The lambda supplies HF_TOKEN from the environment, so the token is
            # never one of the UI inputs.
            generate_script_button.click(
                fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
                inputs=[user_prompt, llama_model_id, duration],
                outputs=[script_output, sound_design_output, music_suggestion_output],
            )

        # Step 2: Generate Voice
        with gr.Tab("🎤 Voice Synthesis"):
            gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
            selected_tts_model = gr.Dropdown(
                label="TTS Model",
                choices=[
                    "tts_models/en/ljspeech/tacotron2-DDC",  
                    "tts_models/en/ljspeech/vits", 
                    "tts_models/en/sam/tacotron-DDC", 
                ],
                value="tts_models/en/ljspeech/tacotron2-DDC",
                multiselect=False
            )
            generate_voice_button = gr.Button("Generate Voice-Over", variant="primary")
            voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")

            # The script text comes from the Step 1 output box.
            generate_voice_button.click(
                fn=lambda script, tts_model: generate_voice(script, tts_model),
                inputs=[script_output, selected_tts_model],
                outputs=voice_audio_output,
            )

        # Step 3: Generate Music
        with gr.Tab("🎶 Music Production"):
            gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
            # The slider value is forwarded to generate_music as its token budget.
            audio_length = gr.Slider(
                label="Music Length (tokens)",
                minimum=128, 
                maximum=1024, 
                step=64, 
                value=512,
                info="Increase tokens for longer audio (inference time may vary)."
            )
            generate_music_button = gr.Button("Generate Music", variant="primary")
            music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")

            # The music prompt is the "Music Suggestions" text produced in Step 1.
            generate_music_button.click(
                fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
                inputs=[music_suggestion_output, audio_length],
                outputs=[music_output],
            )

        # Step 4: Blend Audio
        with gr.Tab("🎚️ Audio Blending"):
            gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
            ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
            duck_level_slider = gr.Slider(
                label="Ducking Level (dB attenuation)", 
                minimum=0, 
                maximum=20, 
                step=1, 
                value=10
            )
            blend_button = gr.Button("Blend Voice + Music", variant="primary")
            blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")

            # Inputs are the file paths held by the Step 2 and Step 3 audio components.
            blend_button.click(
                fn=blend_audio,
                inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
                outputs=blended_output
            )

    # Footer (styled through the ".footer" CSS class above)
    gr.Markdown("""
    <div class="footer">
        <hr>
        Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
        <br>
        <small>AI Ads Promo (Demo MVP) &copy; 2025</small>
    </div>
    """)
    
    # Visitor Badge
    gr.HTML("""
    <div style="text-align: center; margin-top: 1rem;">
        <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
            <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" alt="visitor badge"/>
        </a>
    </div>
    """)

# Start the app as soon as the module is executed (no __main__ guard), with debug output enabled.
demo.launch(debug=True)