Spaces:

Bils
/

AIPromoStudio

Running on Zero

File size: 5,224 Bytes

17d10a7
a15d204
d448add
db46bfb
 
 
 
 
 
 
17d10a7
c243adb
 
f0b5707
d0384c8
f0b5707
613bd9e
7bbdf94
 
613bd9e
 
 
 
f0b5707
7bbdf94
 
613bd9e
f0b5707
613bd9e
7bbdf94
d0384c8
 
17d10a7
d0384c8
17d10a7
 
 
 
 
 
 
 
 
 
 
d0384c8
 
17d10a7
d0384c8
 
17d10a7
 
 
 
 
 
d0384c8
 
17d10a7
3fe530b
17d10a7
 
 
 
 
 
 
 
 
7232157
17d10a7
 
d448add
db46bfb
17d10a7
db46bfb
17d10a7
f0b5707
 
17d10a7
 
d448add
17d10a7
 
d448add
17d10a7
 
 
 
a15d204
17d10a7
7232157
 
 
c243adb
7232157
d448add
db46bfb
17d10a7
c243adb
17d10a7
f0b5707
17d10a7
 
 
 
 
d448add
17d10a7
 
7232157
d448add
17d10a7
 
 
3fe530b
 
17d10a7
db46bfb
7bbdf94

import gradio as gr
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    AutoProcessor, 
    MusicgenForConditionalGeneration
)
import scipy.io.wavfile as wav

# ---------------------------------------------------------------------
# Load Llama 3 Model with Zero GPU
# ---------------------------------------------------------------------
def load_llama_pipeline_zero_gpu(model_id: str, token: str):
    """Build a text-generation pipeline for a causal LM on the GPU.

    Args:
        model_id: Hugging Face Hub repository id of the model to load.
        token: Hugging Face access token. An empty or whitespace-only value
            is treated as "no token supplied" and forwarded as ``None``
            instead of an empty string.

    Returns:
        A ``transformers`` text-generation pipeline on success, or a string
        starting with ``"Error loading model: "`` on any failure (including
        CUDA being unavailable). Callers distinguish the two cases with
        ``isinstance(result, str)``.
    """
    try:
        if not torch.cuda.is_available():
            raise RuntimeError("ZeroGPU is not properly initialized or GPU is unavailable.")

        # The UI passes "" when the token box is left blank; normalise that to
        # None so an empty credential is never sent along with the request.
        auth_token = (token or "").strip() or None

        # ``token=`` is the current spelling of the deprecated
        # ``use_auth_token=`` keyword.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=auth_token,
            torch_dtype=torch.float16,
            device_map="auto",
            # NOTE(review): model_id comes from a free-text UI field, so this
            # flag lets an arbitrary Hub repository run its own Python code
            # in this process. Kept for backward compatibility; consider
            # disabling it or allow-listing model ids.
            trust_remote_code=True,
        )
        return pipeline("text-generation", model=model, tokenizer=tokenizer)
    except Exception as e:
        # Errors are reported as a string rather than raised because the
        # caller forwards the message straight to the UI.
        return f"Error loading model: {e}"

# ---------------------------------------------------------------------
# Generate Radio Script
# ---------------------------------------------------------------------
def generate_script(user_input: str, pipeline_llama):
    """Turn a user's promo concept into a short radio script.

    The concept is embedded in a fixed instruction prompt that ends with the
    marker ``"Refined script:"``. The pipeline is called with sampling
    enabled, and whatever follows the last occurrence of the marker in the
    first generation is returned with surrounding whitespace removed.

    Args:
        user_input: The promo idea to expand.
        pipeline_llama: Callable invoked as
            ``pipeline_llama(prompt, max_new_tokens=..., do_sample=...,
            temperature=...)`` and expected to return a sequence whose first
            item maps ``"generated_text"`` to a string.

    Returns:
        The extracted script, or ``"Error generating script: <reason>"`` if
        anything raises along the way.
    """
    marker = "Refined script:"
    instructions = (
        "You are a top-tier radio imaging producer using Llama 3. "
        "Take the user's concept and craft a short, creative promo script."
    )
    try:
        prompt = f"{instructions}\nUser concept: {user_input}\n{marker}"
        generations = pipeline_llama(
            prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.9,
        )
        full_text = generations[0]["generated_text"]
        # The generated text may echo the prompt; keep only what comes after
        # the final marker.
        tail = full_text.split(marker)[-1]
        return tail.strip()
    except Exception as err:
        return f"Error generating script: {err}"

# ---------------------------------------------------------------------
# Load MusicGen Model
# ---------------------------------------------------------------------
def load_musicgen_model():
    """Load the ``facebook/musicgen-small`` model and its processor.

    Returns:
        ``(model, processor)`` on success. If either load raises, returns
        ``(None, message)`` where ``message`` is the exception text, so the
        caller can detect failure by checking whether the second item is a
        string.
    """
    checkpoint = "facebook/musicgen-small"
    try:
        # The model is loaded before the processor.
        musicgen = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        text_processor = AutoProcessor.from_pretrained(checkpoint)
    except Exception as err:
        return None, str(err)
    return musicgen, text_processor

# ---------------------------------------------------------------------
# Generate Audio
# ---------------------------------------------------------------------
def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
    """Generate music for ``prompt`` and return it as 16-bit PCM samples.

    Args:
        prompt: Text description the generation is conditioned on.
        audio_length: Number of new tokens to generate. Coerced with
            ``int()`` because the value may arrive as a float from the UI.
        mg_model: Object exposing ``generate(**inputs, max_new_tokens=...)``
            and ``config.audio_encoder.sampling_rate``.
        mg_processor: Callable invoked as
            ``mg_processor(text=[prompt], padding=True, return_tensors="pt")``
            whose result is unpacked into ``mg_model.generate``.

    Returns:
        ``(sampling_rate, samples)`` with ``samples`` as an ``int16`` array
        on success, or the exception message as a plain string on failure.

    Side effects:
        Writes the samples to ``radio_jingle.wav`` in the current working
        directory, overwriting any existing file.
    """
    try:
        inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt")

        # Keep the inputs on the same device as the model when both sides
        # expose the needed hooks; this is a no-op for a CPU-resident model.
        device = getattr(mg_model, "device", None)
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        outputs = mg_model.generate(**inputs, max_new_tokens=int(audio_length))
        sr = mg_model.config.audio_encoder.sampling_rate

        # Work in float32: half precision cannot represent 32767 exactly, so
        # scaling an fp16 waveform could overflow the int16 range.
        audio_data = outputs[0, 0].detach().cpu().float().numpy()

        # Vectorised peak instead of the builtin max() over every sample.
        peak = abs(audio_data).max() if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak * 32767
        # A silent (all-zero) or empty waveform is left unscaled, avoiding the
        # divide-by-zero that would otherwise put NaNs into the int16 cast.
        normalized_audio = audio_data.astype("int16")

        output_file = "radio_jingle.wav"
        wav.write(output_file, rate=sr, data=normalized_audio)
        return sr, normalized_audio
    except Exception as e:
        return str(e)

# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
def radio_imaging_app(user_prompt, llama_model_id, hf_token, audio_length):
    """Run the full promo workflow: load the LLM, write a script, render audio.

    Args:
        user_prompt: The promo idea typed by the user.
        llama_model_id: Hub id of the language model used for the script.
        hf_token: Hugging Face access token for loading that model.
        audio_length: Number of MusicGen tokens to generate.

    Returns:
        A ``(text, audio)`` pair for the script and audio outputs. ``audio``
        is the ``(sampling_rate, samples)`` tuple from ``generate_audio`` on
        success and ``None`` whenever any stage fails. Failure messages are
        always delivered through ``text`` so the audio output never receives
        an error string.
    """
    # Load Llama 3 Pipeline with Zero GPU
    pipeline_llama = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)
    if isinstance(pipeline_llama, str):
        return pipeline_llama, None

    # Generate Script
    script = generate_script(user_prompt, pipeline_llama)
    # generate_script reports failures as a prefixed string; stop here rather
    # than using the error message as the music prompt.
    if script.startswith("Error generating script:"):
        return script, None

    # Load MusicGen (on failure the second item carries the error message)
    mg_model, mg_processor = load_musicgen_model()
    if mg_model is None or isinstance(mg_processor, str):
        return f"{script}\n\n[Audio unavailable] {mg_processor}", None

    # Generate Audio (a plain string result means generation failed)
    audio_data = generate_audio(script, audio_length, mg_model, mg_processor)
    if isinstance(audio_data, str):
        return f"{script}\n\n[Audio unavailable] {audio_data}", None

    return script, audio_data

# ---------------------------------------------------------------------
# Interface
# ---------------------------------------------------------------------
# Page layout: a title, one row holding the four inputs, the trigger button,
# and the two outputs. The button runs radio_imaging_app with the inputs in
# the order its parameters are declared.
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")

    with gr.Row():
        user_prompt = gr.Textbox(
            label="Enter your promo idea",
            placeholder="E.g., A 15-second hype jingle for a morning talk show, fun and energetic.",
        )
        llama_model_id = gr.Textbox(
            label="Llama 3 Model ID",
            value="meta-llama/Meta-Llama-3-70B",
        )
        hf_token = gr.Textbox(
            label="Hugging Face Token",
            type="password",
        )
        audio_length = gr.Slider(
            label="Audio Length (tokens)",
            minimum=128,
            maximum=1024,
            step=64,
            value=512,
        )

    generate_button = gr.Button("Generate Promo Script and Audio")
    script_output = gr.Textbox(label="Generated Script")
    audio_output = gr.Audio(label="Generated Audio", type="numpy")

    generate_button.click(
        fn=radio_imaging_app,
        inputs=[user_prompt, llama_model_id, hf_token, audio_length],
        outputs=[script_output, audio_output],
    )

# ---------------------------------------------------------------------
# Launch App
# ---------------------------------------------------------------------
if __name__ == "__main__":
    # Start the server only when this file is executed as a script, so that
    # importing the module (for reuse or inspection) does not launch the app.
    demo.launch(debug=True)