File size: 11,218 Bytes
17d10a7
a15d204
d448add
ab6cd42
 
db46bfb
1c1b50f
 
db46bfb
1c1b50f
db8ba25
db46bfb
cf3593c
d9bf0f0
b950350
6aba99a
3168a3e
019c404
ab6cd42
 
3168a3e
ab6cd42
 
 
cf3593c
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df2a904
ab6cd42
5607a62
 
 
 
 
 
ab6cd42
 
8bda130
 
df2a904
5607a62
8bda130
ab6cd42
 
df2a904
ab6cd42
5607a62
 
ab6cd42
 
 
 
 
 
 
 
5607a62
ab6cd42
 
 
 
 
5607a62
ab6cd42
5607a62
0105281
ab6cd42
0105281
ab6cd42
 
 
 
 
dfa5d3e
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3168a3e
2de59b3
ab6cd42
 
b950350
ab6cd42
2de59b3
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0105281
ab6cd42
 
b950350
ab6cd42
559ca26
ab6cd42
 
 
 
 
 
 
 
 
dfa5d3e
ab6cd42
0105281
ab6cd42
 
17d10a7
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
cf3593c
ab6cd42
0105281
ab6cd42
ecc69bf
ab6cd42
559ca26
 
0105281
ab6cd42
 
 
 
0105281
ab6cd42
559ca26
ab6cd42
0105281
ab6cd42
 
 
 
d9bf0f0
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0105281
ab6cd42
 
 
 
 
 
 
 
0105281
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d064dc
35e8eba
ab6cd42
464b686
ab6cd42
 
 
 
0105281
ab6cd42
 
 
 
 
0105281
ab6cd42
 
 
 
 
0105281
ab6cd42
 
 
 
 
 
 
 
0105281
ab6cd42
 
7b531cd
 
ab6cd42
 
 
 
7b531cd
ab6cd42
 
 
 
 
 
 
8d064dc
ab6cd42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3fe530b
ab6cd42
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
import gradio as gr
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AutoProcessor,
    MusicgenForConditionalGeneration,
)
from scipy.io.wavfile import write
from pydub import AudioSegment
from dotenv import load_dotenv
import tempfile
import spaces
from TTS.api import TTS
import psutil
import GPUtil

# -------------------------------
# Configuration
# -------------------------------
load_dotenv()
# Access token for model downloads: HF_TOKEN is used when it is set,
# otherwise the value of HF_TOKEN_SECRET (None when neither is set).
HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HF_TOKEN_SECRET"))

# Model identifiers used by the app. The two nested tables map a display
# name to the identifier passed to the corresponding loader.
MODEL_CONFIG = dict(
    llama_models={
        "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
        "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
    },
    tts_models={
        "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
        "High Quality": "tts_models/en/ljspeech/vits",
    },
    musicgen_model="facebook/musicgen-medium",
)

# -------------------------------
# Model Manager with Cache
# -------------------------------

class ModelManager:
    """Load the app's models on first use and cache them for later calls.

    Each ``get_*`` method returns the cached object when one exists and
    otherwise loads it, stores it on the instance and returns it.
    """

    def __init__(self):
        self.llama_pipelines = {}  # model id -> text-generation pipeline
        self.musicgen_model = None
        self.tts_models = {}  # TTS model name -> loaded TTS instance
        self.processor = None  # MusicGen processor, loaded with the model

    def get_llama_pipeline(self, model_id, token):
        """Return the cached text-generation pipeline for ``model_id``.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            token: Access token passed to ``from_pretrained``.
        """
        if model_id not in self.llama_pipelines:
            tokenizer = AutoTokenizer.from_pretrained(
                model_id,
                token=token,
                legacy=False
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                token=token,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True
            )
            self.llama_pipelines[model_id] = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                device_map="auto"
            )
        return self.llama_pipelines[model_id]

    def get_musicgen_model(self):
        """Return the cached ``(model, processor)`` pair for MusicGen.

        The model is moved to CUDA when available, otherwise it stays on CPU.
        """
        if self.musicgen_model is None or self.processor is None:
            model = MusicgenForConditionalGeneration.from_pretrained(
                MODEL_CONFIG["musicgen_model"]
            )
            processor = AutoProcessor.from_pretrained(MODEL_CONFIG["musicgen_model"])
            model.to("cuda" if torch.cuda.is_available() else "cpu")
            # Publish only after both loads succeeded so a failed processor
            # download cannot leave a model cached without its processor.
            self.musicgen_model = model
            self.processor = processor
        return self.musicgen_model, self.processor

    def get_tts_model(self, model_name):
        """Return the cached TTS instance for ``model_name``.

        Args:
            model_name: TTS model name (the values of
                ``MODEL_CONFIG["tts_models"]`` are such names).
        """
        if model_name not in self.tts_models:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            self.tts_models[model_name] = TTS(
                model_name=model_name,
                progress_bar=False,
            ).to(device)
        return self.tts_models[model_name]

model_manager = ModelManager()

# -------------------------------
# Core Functions with Enhanced Error Handling
# -------------------------------
@spaces.GPU
def generate_script(user_prompt, model_id, duration, progress=gr.Progress()):
    """Generate the three promo sections for a concept with a text model.

    Args:
        user_prompt: Concept description appended to the instruction prompt.
        model_id: Identifier handed to ``model_manager.get_llama_pipeline``.
        duration: Promo length in seconds, interpolated into the prompt.
        progress: Progress callback invoked with a fraction and a message.

    Returns:
        A list of three strings: voice script, sound design and music
        description. On any failure all three entries hold the same
        ``"Error: ..."`` message.
    """
    try:
        progress(0.1, "Initializing script generation...")
        if not user_prompt or not user_prompt.strip():
            # Avoid loading a large model just to complete an empty concept.
            return ["Error: No concept provided"] * 3

        text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)

        system_prompt = f"""Generate a {duration}-second radio promo with:
1. Voice Script: [Clear narration, 25-35 words]
2. Sound Design: [3-5 specific sound effects]
3. Music: [Genre, tempo, mood]

Format strictly as:
Voice Script: [content]
Sound Design: [effects]
Music: [description]"""

        progress(0.3, "Generating content...")
        response = text_pipeline(
            f"{system_prompt}\nConcept: {user_prompt}",
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            # Return only the completion. With the prompt included, the
            # parser would pick up the "Voice Script: [content]" template
            # lines of the prompt itself as generated content.
            return_full_text=False,
        )

        progress(0.8, "Parsing results...")
        return parse_generated_content(response[0]["generated_text"])
    except Exception as e:
        return [f"Error: {str(e)}"] * 3

def parse_generated_content(text):
    """Split generated text into its three labelled sections.

    A line starting with ``"Voice Script:"``, ``"Sound Design:"`` or
    ``"Music:"`` opens that section; the rest of that line and all following
    non-empty lines belong to it until another header appears. Lines before
    the first header are ignored.

    Args:
        text: Raw generated text; ``None`` is treated as empty.

    Returns:
        ``[voice_script, sound_design, music]`` as stripped strings, each
        empty when its section is absent.
    """
    collected = {"Voice Script": [], "Sound Design": [], "Music": []}
    current_section = None

    for raw_line in (text or "").split('\n'):
        line = raw_line.strip()
        for section in collected:
            header = section + ":"
            if line.startswith(header):
                current_section = section
                # Drop only the leading header; later occurrences of the
                # same words are part of the content and must be kept.
                line = line[len(header):].strip()
                break
        if current_section and line:
            collected[current_section].append(line)

    return ["\n".join(lines) for lines in collected.values()]

@spaces.GPU
def generate_voice(script, tts_model, speed=1.0, progress=gr.Progress()):
    """Synthesize ``script`` to a WAV file with the selected TTS model.

    Args:
        script: Text to speak.
        tts_model: Name handed to ``model_manager.get_tts_model``.
        speed: Value forwarded as ``speed`` to ``tts_to_file``.
        progress: Progress callback invoked with a fraction and a message.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when the
        script is empty or any step raises.
    """
    try:
        progress(0.2, "Initializing TTS...")
        if not script or not script.strip():
            return None, "No script provided"

        tts = model_manager.get_tts_model(tts_model)
        # Use a unique file per call: a fixed name in the shared temp
        # directory would be overwritten by concurrent requests.
        with tempfile.NamedTemporaryFile(
            prefix="voice_", suffix=".wav", delete=False
        ) as handle:
            output_path = handle.name

        progress(0.5, "Generating audio...")
        tts.tts_to_file(text=script, file_path=output_path, speed=speed)

        return output_path, None
    except Exception as e:
        return None, f"Voice Error: {str(e)}"

@spaces.GPU
def generate_music(prompt, duration_sec=30, progress=gr.Progress()):
    """Generate a music track from a text description with MusicGen.

    Args:
        prompt: Text description of the music.
        duration_sec: Requested length in seconds; converted to a token
            budget using the audio encoder's frame rate.
        progress: Progress callback invoked with a fraction and a message.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when the
        prompt is empty or any step raises.
    """
    try:
        progress(0.1, "Initializing MusicGen...")
        if not prompt or not prompt.strip():
            return None, "No music description provided"

        # The manager returns the model together with its cached processor.
        model, processor = model_manager.get_musicgen_model()

        progress(0.4, "Processing input...")
        inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(model.device)

        progress(0.6, "Generating music...")
        # Read both rates from the loaded checkpoint instead of hard-coding
        # them, so the token budget and the WAV header always agree with it.
        audio_config = model.config.audio_encoder
        token_budget = int(duration_sec * audio_config.frame_rate)
        audio_values = model.generate(**inputs, max_new_tokens=token_budget)

        # Use a unique file per call: a fixed name in the shared temp
        # directory would be overwritten by concurrent requests.
        with tempfile.NamedTemporaryFile(
            prefix="music_", suffix=".wav", delete=False
        ) as handle:
            output_path = handle.name
        write(output_path, audio_config.sampling_rate, audio_values[0, 0].cpu().numpy())
        return output_path, None
    except Exception as e:
        return None, f"Music Error: {str(e)}"

def blend_audio(voice_path, music_path, ducking=True, progress=gr.Progress()):
    """Overlay a voice track on a music bed cut to the voice's length.

    The music is repeated when it is shorter than the voice, then trimmed
    to the voice's length before the voice is overlaid on it.

    Args:
        voice_path: Path to the voice WAV file.
        music_path: Path to the music WAV file.
        ducking: When true, lower the music by 10 dB before mixing.
        progress: Progress callback invoked with a fraction and a message.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when an
        input is missing, the music track is empty, or any step raises.
    """
    try:
        if not voice_path or not music_path:
            return None, "Mixing Error: both a voice track and a music track are required"

        progress(0.2, "Loading audio files...")
        voice = AudioSegment.from_wav(voice_path)
        music = AudioSegment.from_wav(music_path)
        if len(music) == 0:
            # An empty bed cannot be looped (the repeat count divides by
            # its length), so report it instead of failing on the division.
            return None, "Mixing Error: music track is empty"

        progress(0.4, "Aligning durations...")
        if len(music) < len(voice):
            music = music * (len(voice) // len(music) + 1)
        music = music[:len(voice)]

        progress(0.6, "Mixing audio...")
        if ducking:
            music = music - 10  # 10dB ducking

        mixed = music.overlay(voice)
        # Use a unique file per call: a fixed name in the shared temp
        # directory would be overwritten by concurrent requests.
        with tempfile.NamedTemporaryFile(
            prefix="final_mix_", suffix=".wav", delete=False
        ) as handle:
            output_path = handle.name
        mixed.export(output_path, format="wav")
        return output_path, None
    except Exception as e:
        return None, f"Mixing Error: {str(e)}"

# -------------------------------
# UI Components
# -------------------------------
def create_audio_visualization(audio_path):
    """Render the waveform of an audio file to a PNG image.

    Args:
        audio_path: Path of the audio file to plot.

    Returns:
        Path of the written PNG, or ``None`` when ``audio_path`` is empty.
    """
    if not audio_path:
        return None
    audio = AudioSegment.from_file(audio_path)
    samples = np.array(audio.get_array_of_samples())
    if audio.channels > 1:
        # Multi-channel samples come interleaved; average the channels so
        # the plot shows one waveform rather than alternating values.
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)

    # Draw on an explicit figure and always close it, so a failure while
    # plotting or saving does not leave the figure open.
    fig, ax = plt.subplots(figsize=(10, 3))
    try:
        ax.plot(samples)
        ax.axis('off')
        fig.tight_layout()

        # Use a unique file per call: a fixed name in the shared temp
        # directory would be overwritten by concurrent requests.
        with tempfile.NamedTemporaryFile(
            prefix="waveform_", suffix=".png", delete=False
        ) as handle:
            temp_file = handle.name
        fig.savefig(temp_file, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)
    return temp_file

def system_monitor():
    """Report current CPU, RAM and GPU utilisation as percentage strings.

    Returns:
        A dict with the keys ``"CPU"``, ``"RAM"`` and ``"GPU"``. The GPU
        entry describes the first GPU reported by GPUtil, or is ``"N/A"``
        when none is reported.
    """
    detected_gpus = GPUtil.getGPUs()
    cpu_usage = f"{psutil.cpu_percent()}%"
    ram_usage = f"{psutil.virtual_memory().percent}%"
    if detected_gpus:
        gpu_usage = f"{detected_gpus[0].load*100:.1f}%"
    else:
        gpu_usage = "N/A"
    return {"CPU": cpu_usage, "RAM": ram_usage, "GPU": gpu_usage}

# -------------------------------
# Gradio Interface
# -------------------------------
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="teal",
).set(
    body_text_color_dark='#FFFFFF',
    background_fill_primary_dark='#1F1F1F'
)


def _audio_with_waveform(audio_path, error_message):
    """Convert a ``(path, error)`` result into ``(audio, waveform)`` outputs.

    Raises ``gr.Error`` with the message when the step reported a failure,
    so the message is shown to the user instead of being sent to an image
    component.
    """
    if error_message:
        raise gr.Error(error_message)
    return audio_path, create_audio_visualization(audio_path)


def _voice_handler(script, tts_model, speed, progress=gr.Progress()):
    """Click handler: synthesize the voiceover and draw its waveform."""
    return _audio_with_waveform(
        *generate_voice(script, tts_model, speed, progress=progress)
    )


def _music_handler(prompt, progress=gr.Progress()):
    """Click handler: generate the music track and draw its waveform."""
    return _audio_with_waveform(*generate_music(prompt, progress=progress))


def _mix_handler(voice_path, music_path, progress=gr.Progress()):
    """Click handler: blend voice and music and draw the mix's waveform."""
    return _audio_with_waveform(
        *blend_audio(voice_path, music_path, progress=progress)
    )


with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo:
    gr.Markdown("# 🎙️ AI Radio Studio Pro")

    with gr.Row():
        with gr.Column(scale=3):
            concept_input = gr.Textbox(
                label="Concept Description",
                placeholder="Describe your radio segment...",
                lines=3
            )
            with gr.Accordion("Advanced Settings", open=False):
                model_selector = gr.Dropdown(
                    list(MODEL_CONFIG["llama_models"].values()),
                    label="AI Model",
                    value=next(iter(MODEL_CONFIG["llama_models"].values()))
                )
                duration_selector = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)")

            generate_btn = gr.Button("Generate Script", variant="primary")

        with gr.Column(scale=2):
            script_output = gr.Textbox(label="Voice Script", interactive=True)
            sound_output = gr.Textbox(label="Sound Design", interactive=True)
            music_output = gr.Textbox(label="Music Style", interactive=True)

    with gr.Tabs():
        with gr.Tab("🎤 Voice Production"):
            with gr.Row():
                tts_selector = gr.Dropdown(
                    list(MODEL_CONFIG["tts_models"].values()),
                    label="Voice Model",
                    value=next(iter(MODEL_CONFIG["tts_models"].values()))
                )
                speed_selector = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate")
            voice_btn = gr.Button("Generate Voiceover", variant="primary")
            with gr.Row():
                # type="filepath": this component is also an input of the
                # final mix, and blend_audio opens its inputs as WAV paths.
                voice_audio = gr.Audio(
                    label="Voice Preview", interactive=False, type="filepath"
                )
                voice_viz = gr.Image(label="Waveform", interactive=False)

        with gr.Tab("🎵 Music Production"):
            music_btn = gr.Button("Generate Music Track", variant="primary")
            with gr.Row():
                # type="filepath" for the same reason as the voice preview.
                music_audio = gr.Audio(
                    label="Music Preview", interactive=False, type="filepath"
                )
                music_viz = gr.Image(label="Waveform", interactive=False)

        with gr.Tab("🔉 Final Mix"):
            mix_btn = gr.Button("Create Final Mix", variant="primary")
            with gr.Row():
                final_mix_audio = gr.Audio(label="Final Mix", interactive=False)
                final_mix_viz = gr.Image(label="Waveform", interactive=False)
            with gr.Row():
                # NOTE(review): these two buttons have no event handler
                # attached in this file.
                download_btn = gr.Button("Download Mix")
                play_btn = gr.Button("▶️ Play in Browser")

    with gr.Accordion("📊 System Monitor", open=False):
        monitor = gr.JSON(label="Resource Usage", value=system_monitor, every=5)

    gr.Markdown("""
    <div style="text-align: center; padding: 20px; border-top: 1px solid #444;">
        <p>Created with ❤️ by <a href="https://bilsimaging.com">Bils Imaging</a></p>
        <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/radiogold&countColor=%23263759">
    </div>
    """)

    # Event Handling
    generate_btn.click(
        generate_script,
        [concept_input, model_selector, duration_selector],
        [script_output, sound_output, music_output]
    )

    # The audio steps return (path, error). The adapter handlers surface the
    # error and supply the waveform image for the second output; the
    # `preprocess` argument of click() is a boolean flag and cannot be used
    # to post-process results.
    voice_btn.click(
        _voice_handler,
        [script_output, tts_selector, speed_selector],
        [voice_audio, voice_viz]
    )

    music_btn.click(
        _music_handler,
        [music_output],
        [music_audio, music_viz]
    )

    mix_btn.click(
        _mix_handler,
        [voice_audio, music_audio],
        [final_mix_audio, final_mix_viz]
    )

if __name__ == "__main__":
    # Listen on every network interface, on port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)