File size: 6,133 Bytes
17d10a7
a15d204
d448add
db46bfb
 
 
 
 
 
 
cf3593c
 
 
e7b189b
c243adb
dfa5d3e
cf3593c
 
 
e7b189b
 
 
 
 
dfa5d3e
e7b189b
dfa5d3e
cf3593c
f0b5707
e7b189b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0384c8
dfa5d3e
 
 
e7b189b
dfa5d3e
 
 
 
 
 
e7b189b
dfa5d3e
 
 
 
 
 
 
cf3593c
e7b189b
 
 
 
 
17d10a7
dfa5d3e
17d10a7
 
dfa5d3e
cf3593c
17d10a7
 
 
d448add
cf3593c
 
 
 
 
d448add
dfa5d3e
 
 
b50e3e1
e7b189b
 
 
dfa5d3e
 
e7b189b
b50e3e1
dfa5d3e
b50e3e1
e7b189b
dfa5d3e
 
 
 
17d10a7
f0b5707
b50e3e1
 
70d35c8
 
 
 
 
 
 
 
 
 
 
 
 
b50e3e1
 
70d35c8
 
 
 
 
 
 
 
 
 
 
 
3fe530b
a8c9cb5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    AutoProcessor, 
    MusicgenForConditionalGeneration
)
from scipy.io.wavfile import write
import tempfile
from dotenv import load_dotenv
import spaces  

# Load environment variables through python-dotenv, then read the
# Hugging Face token from the environment (None when HF_TOKEN is not set).
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# Lazily-loaded model handles: all start empty and are filled in by the
# loader functions the first time they are needed.
llama_pipeline = musicgen_model = musicgen_processor = None

# ---------------------------------------------------------------------
# Load Llama 3 Model with Zero GPU (Lazy Loading)
# ---------------------------------------------------------------------
# Model ID the cached ``llama_pipeline`` was built from (None until a load succeeds).
_llama_pipeline_model_id = None


@spaces.GPU(duration=120)
def load_llama_pipeline_zero_gpu(model_id: str, token: str):
    """Return a text-generation pipeline for ``model_id``, loading it on demand.

    The pipeline is cached in the module-level ``llama_pipeline``. The cache is
    keyed on ``model_id``: asking for a different model discards the cached
    pipeline and loads the requested one instead of silently returning the
    previously loaded model.

    Args:
        model_id: Hugging Face Hub model identifier to load.
        token: Hugging Face access token forwarded to ``from_pretrained``.

    Returns:
        The text-generation pipeline on success, or a ``str`` error message
        starting with ``"Error loading Llama pipeline:"`` when loading raised.
        Callers distinguish the two with ``isinstance(result, str)``.
    """
    global llama_pipeline, _llama_pipeline_model_id

    if llama_pipeline is not None and _llama_pipeline_model_id == model_id:
        return llama_pipeline

    # Drop any pipeline built for another model before loading the new one so
    # both sets of weights are not kept referenced at the same time.
    llama_pipeline = None
    _llama_pipeline_model_id = None
    try:
        # ``token`` replaces the deprecated ``use_auth_token`` keyword.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=token,
            torch_dtype=torch.float16,
            device_map="auto",  # Automatically handles GPU allocation
            # NOTE(review): trust_remote_code=True allows code shipped with the
            # model repository to run; model_id comes from a user-editable
            # textbox in this app. Consider disabling it or allow-listing IDs.
            trust_remote_code=True,
        )
        llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
        _llama_pipeline_model_id = model_id
    except Exception as e:
        return f"Error loading Llama pipeline: {e}"
    return llama_pipeline

# ---------------------------------------------------------------------
# Load MusicGen Model (Lazy Loading)
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def load_musicgen_model():
    """Return the cached MusicGen model and processor, loading them on first use.

    Both objects are stored in the module-level ``musicgen_model`` and
    ``musicgen_processor`` globals; loading is attempted whenever either of
    them is still ``None``.

    Returns:
        ``(model, processor)`` on success, or ``(None, message)`` where
        ``message`` is an error string if loading raised an exception.
    """
    global musicgen_model, musicgen_processor

    already_loaded = musicgen_model is not None and musicgen_processor is not None
    if not already_loaded:
        checkpoint = "facebook/musicgen-small"
        try:
            musicgen_model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
            musicgen_processor = AutoProcessor.from_pretrained(checkpoint)
        except Exception as load_error:
            return None, f"Error loading MusicGen model: {load_error}"
    return musicgen_model, musicgen_processor

# ---------------------------------------------------------------------
# Generate Radio Script
# ---------------------------------------------------------------------
def generate_script(user_input: str, llama_pipeline):
    """Turn a promo concept into a short radio script via a text-generation callable.

    Args:
        user_input: The user's promo idea; it is inserted into the prompt.
        llama_pipeline: Callable invoked as ``llama_pipeline(prompt, **options)``.
            The first element of its result must provide ``'generated_text'``.

    Returns:
        The text that follows the last ``"Refined script:"`` marker in the
        generated output, with surrounding whitespace removed. If anything
        raises, a string starting with ``"Error generating script:"``.
    """
    marker = "Refined script:"
    sampling_options = {"max_new_tokens": 200, "do_sample": True, "temperature": 0.9}
    try:
        persona = (
            "You are a top-tier radio imaging producer using Llama 3. "
            "Take the user's concept and craft a short, creative promo script."
        )
        prompt = "\n".join((persona, f"User concept: {user_input}", marker))
        generations = llama_pipeline(prompt, **sampling_options)
        full_text = generations[0]["generated_text"]
        # Keep only what comes after the final marker; when the marker is
        # absent the whole text is kept.
        pieces = full_text.split(marker)
        return pieces[-1].strip()
    except Exception as err:
        return f"Error generating script: {err}"

# ---------------------------------------------------------------------
# Generate Audio
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def generate_audio(prompt: str, audio_length: int):
    """Generate audio for ``prompt`` with MusicGen and save it as a WAV file.

    Args:
        prompt: Text description passed to the MusicGen processor.
        audio_length: Number of new tokens to generate; coerced to ``int``
            because UI sliders may deliver the value as a float.

    Returns:
        Path of the written ``.wav`` file on success. On failure, a ``str``
        error message: either the one reported by ``load_musicgen_model`` or
        one starting with ``"Error generating audio:"``.
    """
    mg_model, mg_processor = load_musicgen_model()
    if mg_model is None or isinstance(mg_processor, str):
        # On a load failure the second element carries the error message.
        return mg_processor

    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        mg_model.to(device)
        try:
            # The inputs must live on the same device as the model, otherwise
            # generate() fails with a device-mismatch error.
            inputs = mg_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
            outputs = mg_model.generate(**inputs, max_new_tokens=int(audio_length))
        finally:
            # Always hand the model back to the CPU, even if generation failed.
            mg_model.to("cpu")

        sr = mg_model.config.audio_encoder.sampling_rate
        audio_data = outputs[0, 0].cpu().numpy()

        # Peak-normalise to the int16 range. The vectorised max avoids a
        # Python-level loop over every sample, and the guard avoids dividing
        # by zero when the output is pure silence.
        peak = abs(audio_data).max()
        if peak > 0:
            audio_data = audio_data / peak
        normalized_audio = (audio_data * 32767).astype("int16")

        # Reserve a file name, close the handle, then write: writing by name
        # while the handle is still open is not portable across platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        write(wav_path, sr, normalized_audio)
        return wav_path
    except Exception as e:
        return f"Error generating audio: {e}"

# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
def radio_imaging_script(user_prompt, llama_model_id):
    """Load the requested Llama pipeline and produce a promo script with it.

    Args:
        user_prompt: The promo idea entered by the user.
        llama_model_id: Model identifier handed to the pipeline loader.

    Returns:
        The generated script, or the loader's error message when the
        pipeline could not be loaded.
    """
    loaded = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)

    # The loader signals failure by returning the error text as a string.
    if not isinstance(loaded, str):
        return generate_script(user_prompt, loaded)
    return loaded

def radio_imaging_audio(script, audio_length):
    """Produce audio for ``script`` by delegating to ``generate_audio``.

    Args:
        script: Text used as the audio generation prompt.
        audio_length: Token budget forwarded to ``generate_audio``.

    Returns:
        Whatever ``generate_audio`` returns for these arguments.
    """
    audio_result = generate_audio(script, audio_length)
    return audio_result

# ---------------------------------------------------------------------
# Interface
# ---------------------------------------------------------------------
# Build the two-step UI: step 1 turns a promo idea into a script, step 2
# turns that script into audio. Components are laid out in creation order.
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")

    # Script generation section: promo idea + model ID in, generated script out.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Step 1: Generate the Promo Script")
            user_prompt = gr.Textbox(label="Enter your promo idea", placeholder="E.g., A 15-second hype jingle for a morning talk show.")
            # The model ID is user-editable and is passed to radio_imaging_script as-is.
            llama_model_id = gr.Textbox(label="Llama 3 Model ID", value="meta-llama/Meta-Llama-3-70B")
            generate_script_button = gr.Button("Generate Promo Script")
            # Read-only box; its content is also the input of the audio step below.
            script_output = gr.Textbox(label="Generated Script", interactive=False)

            # Clicking the button calls radio_imaging_script(user_prompt, llama_model_id)
            # and shows the returned text (script or error message) in script_output.
            generate_script_button.click(
                fn=radio_imaging_script,
                inputs=[user_prompt, llama_model_id],
                outputs=script_output
            )

    # Audio generation section: generated script + token budget in, audio out.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Step 2: Generate the Sound")
            # The slider value is forwarded as the audio_length argument (a token count).
            audio_length = gr.Slider(label="Audio Length (tokens)", minimum=128, maximum=1024, step=64, value=512)
            generate_audio_button = gr.Button("Generate Sound from Script")
            # NOTE(review): this component is configured for file paths, but
            # radio_imaging_audio can also return an error-message string on
            # failure — confirm how that is surfaced to the user.
            audio_output = gr.Audio(label="Generated Audio", type="filepath")

            # Clicking the button calls radio_imaging_audio(script_output, audio_length)
            # and sends its return value to audio_output.
            generate_audio_button.click(
                fn=radio_imaging_audio,
                inputs=[script_output, audio_length],
                outputs=audio_output
            )

# Start the app at import time; debug=True is passed straight to Gradio's
# launch() (see the Gradio documentation for its exact effect).
demo.launch(debug=True)