File size: 5,716 Bytes
17d10a7
a15d204
d448add
db46bfb
 
 
 
 
 
 
cf3593c
 
 
dfa5d3e
c243adb
dfa5d3e
cf3593c
 
 
dfa5d3e
 
 
cf3593c
f0b5707
613bd9e
 
 
 
 
f0b5707
dfa5d3e
7bbdf94
613bd9e
f0b5707
613bd9e
cf3593c
d0384c8
dfa5d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf3593c
17d10a7
 
dfa5d3e
17d10a7
 
dfa5d3e
cf3593c
17d10a7
 
 
d448add
cf3593c
 
 
 
 
d448add
dfa5d3e
 
 
b50e3e1
dfa5d3e
 
 
b50e3e1
dfa5d3e
 
 
b50e3e1
dfa5d3e
b50e3e1
dfa5d3e
 
 
b50e3e1
dfa5d3e
 
 
b50e3e1
dfa5d3e
 
 
 
17d10a7
f0b5707
b50e3e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe530b
7bbdf94
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import gradio as gr
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline,
    AutoProcessor, 
    MusicgenForConditionalGeneration
)
from scipy.io.wavfile import write
import tempfile
from dotenv import load_dotenv
import spaces  # Assumes Hugging Face Spaces library supports `@spaces.GPU`

# Pull settings from a local .env file into the process environment, then
# read the Hugging Face access token (None when HF_TOKEN is not set).
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# ---------------------------------------------------------------------
# Load Llama 3 Model with Zero GPU
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def load_llama_pipeline_zero_gpu(model_id: str, token: str):
    """Build a text-generation pipeline for a causal language model.

    Args:
        model_id: Model identifier passed to ``from_pretrained``.
        token: Hugging Face access token forwarded to ``from_pretrained``.

    Returns:
        A ``transformers`` text-generation pipeline on success. On any
        failure the exception message is returned as a plain ``str`` instead
        of being raised, so callers must check ``isinstance(result, str)``.
    """
    try:
        # `token=` is the current keyword; `use_auth_token=` is deprecated.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=token,
            torch_dtype=torch.float16,
            device_map="auto",  # Automatically handles GPU allocation
            trust_remote_code=True,
        )
        return pipeline("text-generation", model=model, tokenizer=tokenizer)
    except Exception as e:
        # Deliberate best-effort: report the failure as text to the caller.
        return str(e)

# ---------------------------------------------------------------------
# Generate Radio Script
# ---------------------------------------------------------------------
def generate_script(user_input: str, pipeline_llama):
    """Turn a promo concept into a short script via a text-generation callable.

    Args:
        user_input: The concept typed by the user.
        pipeline_llama: Callable invoked as ``pipeline_llama(prompt, **kwargs)``;
            its result is read as ``result[0]['generated_text']``.

    Returns:
        The text after the last "Refined script:" marker in the generated
        output, with surrounding whitespace stripped. If anything raises, an
        "Error generating script: ..." message is returned instead.
    """
    try:
        persona = (
            "You are a top-tier radio imaging producer using Llama 3. "
            "Take the user's concept and craft a short, creative promo script."
        )
        prompt = f"{persona}\nUser concept: {user_input}\nRefined script:"
        generations = pipeline_llama(
            prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.9,
        )
        full_text = generations[0]["generated_text"]
        # Keep only what follows the final marker (the whole text if absent).
        _, _, script = full_text.rpartition("Refined script:")
        return script.strip()
    except Exception as e:
        return f"Error generating script: {e}"

# ---------------------------------------------------------------------
# Load MusicGen Model
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def load_musicgen_model():
    """Load the ``facebook/musicgen-small`` model together with its processor.

    Returns:
        ``(model, processor)`` on success. If loading raises, returns
        ``(None, message)`` where ``message`` is the exception text, so
        callers detect failure by checking whether the second item is a
        ``str``.
    """
    checkpoint = "facebook/musicgen-small"
    try:
        musicgen = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        musicgen_processor = AutoProcessor.from_pretrained(checkpoint)
    except Exception as load_error:
        return None, str(load_error)
    return musicgen, musicgen_processor

# ---------------------------------------------------------------------
# Generate Audio
# ---------------------------------------------------------------------
@spaces.GPU(duration=120)
def generate_audio(prompt: str, audio_length: int, mg_model, mg_processor):
    """Generate audio from a text prompt and save it as a temporary WAV file.

    Args:
        prompt: Text description fed to the processor.
        audio_length: Number of new tokens to generate (passed to
            ``generate`` as ``max_new_tokens``).
        mg_model: Loaded MusicGen model.
        mg_processor: Processor matching ``mg_model``.

    Returns:
        Path of a temporary ``.wav`` file holding int16 samples on success,
        or an "Error generating audio: ..." message if anything raises.
    """
    try:
        mg_model.to("cuda")  # Move the model to GPU
        try:
            # The inputs must live on the same device as the model, otherwise
            # generate() fails with a device-mismatch error.
            inputs = mg_processor(
                text=[prompt], padding=True, return_tensors="pt"
            ).to("cuda")
            outputs = mg_model.generate(**inputs, max_new_tokens=int(audio_length))
        finally:
            # Always hand the model back to the CPU, even if generation failed.
            mg_model.to("cpu")

        sr = mg_model.config.audio_encoder.sampling_rate
        audio_data = outputs[0, 0].cpu().numpy()

        # Peak-normalize to the int16 range. Skip the division for silent
        # (all-zero) or empty output so no NaNs are produced.
        peak = abs(audio_data).max() if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak
        normalized_audio = (audio_data * 32767).astype("int16")

        # Reserve a path and close the handle before writing to it by name.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        write(wav_path, sr, normalized_audio)
        return wav_path
    except Exception as e:
        return f"Error generating audio: {e}"

# ---------------------------------------------------------------------
# Gradio Interface
# ---------------------------------------------------------------------
def radio_imaging_script(user_prompt, llama_model_id):
    """Gradio callback: load the requested model and write a promo script.

    Args:
        user_prompt: The promo idea entered by the user.
        llama_model_id: Model identifier handed to the pipeline loader.

    Returns:
        The generated script, or the loader's error text when the pipeline
        could not be built.
    """
    llama_or_error = load_llama_pipeline_zero_gpu(llama_model_id, hf_token)

    # The loader signals failure by returning the error text, not a pipeline.
    if not isinstance(llama_or_error, str):
        return generate_script(user_prompt, llama_or_error)
    return llama_or_error

def radio_imaging_audio(script, audio_length):
    """Gradio callback: render a script to audio with MusicGen.

    Args:
        script: Text prompt used for audio generation.
        audio_length: Number of tokens to generate.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If the model cannot be loaded or generation fails. The
            result of this callback feeds a file-path audio output, so an
            error message must not be returned as if it were a path.
    """
    # Load MusicGen; on failure the second item holds the error text.
    mg_model, mg_processor = load_musicgen_model()
    if isinstance(mg_processor, str):
        raise gr.Error(mg_processor)

    # Generate Audio; failures come back as an "Error generating audio:" string.
    audio_path = generate_audio(script, audio_length, mg_model, mg_processor)
    if isinstance(audio_path, str) and audio_path.startswith("Error generating audio:"):
        raise gr.Error(audio_path)
    return audio_path

# ---------------------------------------------------------------------
# Interface
# ---------------------------------------------------------------------
# Two-step UI: step 1 produces a script, step 2 turns that script into audio.
with gr.Blocks() as demo:
    gr.Markdown("# 🎧 AI Radio Imaging with Llama 3 + MusicGen (Zero GPU)")

    # Script Generation Section
    # gr.Group replaces gr.Box, which is no longer available in Gradio 4.
    with gr.Group():
        gr.Markdown("## Step 1: Generate the Promo Script")
        user_prompt = gr.Textbox(
            label="Enter your promo idea",
            placeholder="E.g., A 15-second hype jingle for a morning talk show.",
        )
        llama_model_id = gr.Textbox(
            label="Llama 3 Model ID",
            value="meta-llama/Meta-Llama-3-70B",
        )
        generate_script_button = gr.Button("Generate Promo Script")
        script_output = gr.Textbox(label="Generated Script", interactive=False)

        generate_script_button.click(
            fn=radio_imaging_script,
            inputs=[user_prompt, llama_model_id],
            outputs=script_output,
        )

    # Audio Generation Section
    with gr.Group():
        gr.Markdown("## Step 2: Generate the Sound")
        audio_length = gr.Slider(
            label="Audio Length (tokens)",
            minimum=128,
            maximum=1024,
            step=64,
            value=512,
        )
        generate_audio_button = gr.Button("Generate Sound from Script")
        audio_output = gr.Audio(label="Generated Audio", type="filepath")

        # The script produced in step 1 is the prompt for audio generation.
        generate_audio_button.click(
            fn=radio_imaging_audio,
            inputs=[script_output, audio_length],
            outputs=audio_output,
        )

# Only start the server when run as a script, so importing this module
# (e.g. for tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch(debug=True)