Bils committed on
Commit
8d064dc
·
verified ·
1 Parent(s): 7b531cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -102
app.py CHANGED
@@ -45,10 +45,10 @@ class ModelManager:
45
 
46
  def get_llama_pipeline(self, model_id, token):
47
  if model_id not in self.llama_pipelines:
48
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
49
  model = AutoModelForCausalLM.from_pretrained(
50
  model_id,
51
- token=token,
52
  torch_dtype=torch.float16,
53
  device_map="auto",
54
  attn_implementation="flash_attention_2"
@@ -85,20 +85,17 @@ def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens
85
  try:
86
  text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
87
 
88
- system_prompt = f"""You are an AI audio production assistant. Create content for a {duration}-second promo:
89
- 1. Voice Script: [Clear, engaging narration]
90
- 2. Sound Design: [3-5 specific sound effects]
91
- 3. Music: [Genre, tempo, mood suggestions]
92
 
93
- Keep sections concise and production-ready."""
94
-
95
- messages = [
96
- {"role": "system", "content": system_prompt},
97
- {"role": "user", "content": user_prompt}
98
- ]
99
 
 
 
100
  response = text_pipeline(
101
- messages,
102
  max_new_tokens=max_tokens,
103
  temperature=temperature,
104
  do_sample=True,
@@ -106,8 +103,7 @@ Keep sections concise and production-ready."""
106
  eos_token_id=text_pipeline.tokenizer.eos_token_id
107
  )
108
 
109
- return parse_generated_content(response[0]['generated_text'][-1]['content'])
110
-
111
  except Exception as e:
112
  return f"Error: {str(e)}", "", ""
113
 
@@ -140,10 +136,10 @@ def parse_generated_content(text):
140
  def generate_voice(script, tts_model, speed=1.0):
141
  try:
142
  if not script.strip():
143
- raise ValueError("Empty script")
144
 
145
  tts = model_manager.get_tts_model(tts_model)
146
- output_path = os.path.join(tempfile.gettempdir(), "enhanced_voice.wav")
147
 
148
  tts.tts_to_file(
149
  text=script,
@@ -174,8 +170,8 @@ def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0)
174
  do_sample=True
175
  )
176
 
177
- output_path = os.path.join(tempfile.gettempdir(), "enhanced_music.wav")
178
- write(output_path, 32000, audio_values[0, 0].cpu().numpy())
179
  return output_path
180
  except Exception as e:
181
  return f"Error: {str(e)}"
@@ -185,19 +181,22 @@ def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=5
185
  voice = AudioSegment.from_wav(voice_path)
186
  music = AudioSegment.from_wav(music_path)
187
 
 
188
  if len(music) < len(voice):
189
  loops = (len(voice) // len(music)) + 1
190
  music = music * loops
191
 
192
  music = music[:len(voice)].fade_out(crossfade)
 
193
 
 
194
  if ducking:
195
  ducked_music = music - duck_level
196
- mixed = ducked_music.overlay(voice.fade_in(crossfade))
197
  else:
198
  mixed = music.overlay(voice)
199
 
200
- output_path = os.path.join(tempfile.gettempdir(), "enhanced_mix.wav")
201
  mixed.export(output_path, format="wav")
202
  return output_path
203
  except Exception as e:
@@ -214,124 +213,119 @@ theme = gr.themes.Soft(
214
  background_fill_primary_dark='#1F1F1F'
215
  )
216
 
217
- with gr.Blocks(theme=theme, title="AI Audio Studio Pro") as demo:
218
  gr.Markdown("""
219
- # 🎙️ AI Audio Studio Pro
220
- *Next-generation audio production powered by AI*
221
  """)
222
-
223
  with gr.Tabs():
224
- with gr.Tab("🎯 Concept Development"):
 
225
  with gr.Row():
226
  with gr.Column(scale=2):
227
  concept_input = gr.Textbox(
228
- label="Your Concept",
229
- placeholder="Describe your audio project...",
230
- lines=3,
231
- max_lines=6
232
  )
233
  with gr.Accordion("Advanced Settings", open=False):
234
- with gr.Row():
235
- model_selector = gr.Dropdown(
236
- choices=list(MODEL_CONFIG["llama_models"].values()),
237
- label="AI Model",
238
- value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
239
- )
240
- duration_slider = gr.Slider(15, 120, value=30, step=15, label="Duration (seconds)")
241
- with gr.Row():
242
- temp_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Creativity")
243
- token_slider = gr.Slider(128, 1024, value=512, step=128, label="Max Length")
244
-
245
- generate_btn = gr.Button("✨ Generate Concept", variant="primary")
246
 
247
  with gr.Column(scale=1):
248
  script_output = gr.Textbox(label="Voice Script", interactive=True)
249
  sound_output = gr.Textbox(label="Sound Design", interactive=True)
250
- music_output = gr.Textbox(label="Music Suggestions", interactive=True)
251
-
252
- generate_btn.click(
253
- generate_script,
254
- inputs=[concept_input, model_selector, duration_slider, temp_slider, token_slider],
255
- outputs=[script_output, sound_output, music_output]
256
- )
257
 
258
- with gr.Tab("🗣️ Voice Production"):
 
259
  with gr.Row():
260
  with gr.Column():
261
- tts_model = gr.Dropdown(
262
  choices=list(MODEL_CONFIG["tts_models"].values()),
263
  label="Voice Model",
264
- value=MODEL_CONFIG["tts_models"]["Standard English"]
265
  )
266
- speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Speaking Rate")
267
- voice_btn = gr.Button("🎙️ Generate Voiceover", variant="primary")
268
  with gr.Column():
269
- voice_preview = gr.Audio(label="Preview", interactive=False)
270
- voice_btn.click(
271
- generate_voice,
272
- inputs=[script_output, tts_model, speed_slider],
273
- outputs=voice_preview
274
- )
275
 
276
- with gr.Tab("🎶 Music Production"):
 
277
  with gr.Row():
278
  with gr.Column():
279
- with gr.Accordion("Music Parameters", open=True):
280
- music_duration = gr.Slider(10, 120, value=30, label="Duration (seconds)")
281
- music_temp = gr.Slider(0.1, 2.0, value=1.0, label="Creativity")
282
- guidance_scale = gr.Slider(1.0, 5.0, value=3.0, label="Focus")
283
- music_btn = gr.Button("🎵 Generate Music", variant="primary")
284
  with gr.Column():
285
- music_preview = gr.Audio(label="Preview", interactive=False)
286
- music_btn.click(
287
- generate_music,
288
- inputs=[music_output, music_duration, music_temp, guidance_scale],
289
- outputs=music_preview
290
- )
291
 
292
- with gr.Tab("🔊 Final Mix"):
 
293
  with gr.Row():
294
  with gr.Column():
295
- ducking_toggle = gr.Checkbox(value=True, label="Enable Voice Ducking")
296
- duck_level = gr.Slider(0, 30, value=12, label="Ducking Strength (dB)")
297
- crossfade_time = gr.Slider(0, 2000, value=500, label="Crossfade (ms)")
298
- mix_btn = gr.Button("🚀 Create Final Mix", variant="primary")
299
  with gr.Column():
300
- final_mix = gr.Audio(label="Master Output", interactive=False)
301
- mix_btn.click(
302
- blend_audio,
303
- inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
304
- outputs=final_mix
305
- )
306
-
307
- with gr.Accordion("📚 Example Prompts", open=False):
308
  gr.Examples(
309
  examples=[
310
- ["A 30-second tech podcast intro with futuristic sounds"],
311
- ["A 15-second radio ad for a coffee shop with morning vibes"],
312
  ["A 60-second documentary trailer with epic orchestral music"]
313
  ],
314
  inputs=concept_input
315
  )
316
-
317
- with gr.Row():
318
- gr.Markdown("### System Resources")
319
- gpu_status = gr.Textbox(label="GPU Utilization", interactive=False)
320
- ram_status = gr.Textbox(label="RAM Usage", interactive=False)
321
 
322
- # Custom Footer
323
  gr.Markdown("""
324
- <hr>
325
- <p style="text-align: center; font-size: 0.9em;">
326
- Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
327
- </p>
 
 
 
 
328
  """)
 
 
 
 
 
 
 
329
 
330
- gr.HTML("""
331
- <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
332
- <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" />
333
- </a>
334
- """)
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  if __name__ == "__main__":
337
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
45
 
46
  def get_llama_pipeline(self, model_id, token):
47
  if model_id not in self.llama_pipelines:
48
+ tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
49
  model = AutoModelForCausalLM.from_pretrained(
50
  model_id,
51
+ use_auth_token=token,
52
  torch_dtype=torch.float16,
53
  device_map="auto",
54
  attn_implementation="flash_attention_2"
 
85
  try:
86
  text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
87
 
88
+ system_prompt = f"""You are an expert radio imaging producer. Create content for a {duration}-second promo:
89
+ 1. Voice Script: [Clear narration]
90
+ 2. Sound Design: [3-5 effects]
91
+ 3. Music: [Genre/tempo/mood]
92
 
93
+ Respond in this exact format:"""
 
 
 
 
 
94
 
95
+ prompt = f"{system_prompt}\nConcept: {user_prompt}\nVoice Script:"
96
+
97
  response = text_pipeline(
98
+ prompt,
99
  max_new_tokens=max_tokens,
100
  temperature=temperature,
101
  do_sample=True,
 
103
  eos_token_id=text_pipeline.tokenizer.eos_token_id
104
  )
105
 
106
+ return parse_generated_content(response[0]["generated_text"])
 
107
  except Exception as e:
108
  return f"Error: {str(e)}", "", ""
109
 
 
136
  def generate_voice(script, tts_model, speed=1.0):
137
  try:
138
  if not script.strip():
139
+ return "Error: Empty script"
140
 
141
  tts = model_manager.get_tts_model(tts_model)
142
+ output_path = os.path.join(tempfile.gettempdir(), "voice.wav")
143
 
144
  tts.tts_to_file(
145
  text=script,
 
170
  do_sample=True
171
  )
172
 
173
+ output_path = os.path.join(tempfile.gettempdir(), "music.wav")
174
+ write(output_path, 44100, audio_values[0, 0].cpu().numpy())
175
  return output_path
176
  except Exception as e:
177
  return f"Error: {str(e)}"
 
181
  voice = AudioSegment.from_wav(voice_path)
182
  music = AudioSegment.from_wav(music_path)
183
 
184
+ # Align durations with crossfade
185
  if len(music) < len(voice):
186
  loops = (len(voice) // len(music)) + 1
187
  music = music * loops
188
 
189
  music = music[:len(voice)].fade_out(crossfade)
190
+ voice = voice.fade_in(crossfade)
191
 
192
+ # Apply ducking
193
  if ducking:
194
  ducked_music = music - duck_level
195
+ mixed = ducked_music.overlay(voice)
196
  else:
197
  mixed = music.overlay(voice)
198
 
199
+ output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav")
200
  mixed.export(output_path, format="wav")
201
  return output_path
202
  except Exception as e:
 
213
  background_fill_primary_dark='#1F1F1F'
214
  )
215
 
216
+ with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo:
217
  gr.Markdown("""
218
+ # 🎧 AI Radio Studio Pro
219
+ *Professional Audio Production in 4 Steps*
220
  """)
221
+
222
  with gr.Tabs():
223
+ # Step 1: Concept Development
224
+ with gr.Tab("1️⃣ Concept"):
225
  with gr.Row():
226
  with gr.Column(scale=2):
227
  concept_input = gr.Textbox(
228
+ label="Your Idea",
229
+ placeholder="e.g., A 30-second morning show intro with energetic music...",
230
+ lines=3
 
231
  )
232
  with gr.Accordion("Advanced Settings", open=False):
233
+ model_selector = gr.Dropdown(
234
+ choices=list(MODEL_CONFIG["llama_models"].values()),
235
+ label="AI Model",
236
+ value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
237
+ )
238
+ duration_slider = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)")
239
+ temp_slider = gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Creativity")
240
+ generate_btn = gr.Button("Generate Script", variant="primary")
 
 
 
 
241
 
242
  with gr.Column(scale=1):
243
  script_output = gr.Textbox(label="Voice Script", interactive=True)
244
  sound_output = gr.Textbox(label="Sound Design", interactive=True)
245
+ music_output = gr.Textbox(label="Music Style", interactive=True)
 
 
 
 
 
 
246
 
247
+ # Step 2: Voice Production
248
+ with gr.Tab("2️⃣ Voice"):
249
  with gr.Row():
250
  with gr.Column():
251
+ tts_selector = gr.Dropdown(
252
  choices=list(MODEL_CONFIG["tts_models"].values()),
253
  label="Voice Model",
254
+ value="tts_models/en/ljspeech/tacotron2-DDC"
255
  )
256
+ speed_slider = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate")
257
+ voice_btn = gr.Button("Generate Voiceover", variant="primary")
258
  with gr.Column():
259
+ voice_preview = gr.Audio(label="Preview", type="filepath")
 
 
 
 
 
260
 
261
+ # Step 3: Music Production
262
+ with gr.Tab("3️⃣ Music"):
263
  with gr.Row():
264
  with gr.Column():
265
+ music_duration = gr.Slider(10, 120, 30, label="Duration (seconds)")
266
+ music_temp = gr.Slider(0.1, 2.0, 1.0, label="Creativity")
267
+ guidance_scale = gr.Slider(1.0, 5.0, 3.0, label="Focus")
268
+ music_btn = gr.Button("Generate Music", variant="primary")
 
269
  with gr.Column():
270
+ music_preview = gr.Audio(label="Preview", type="filepath")
 
 
 
 
 
271
 
272
+ # Step 4: Final Mix
273
+ with gr.Tab("4️⃣ Mix"):
274
  with gr.Row():
275
  with gr.Column():
276
+ ducking_toggle = gr.Checkbox(True, label="Enable Voice Ducking")
277
+ duck_level = gr.Slider(0, 30, 12, label="Ducking Strength (dB)")
278
+ crossfade_time = gr.Slider(0, 2000, 500, label="Crossfade (ms)")
279
+ mix_btn = gr.Button("Create Final Mix", variant="primary")
280
  with gr.Column():
281
+ final_mix = gr.Audio(label="Master Output", type="filepath")
282
+
283
+ # Examples & Footer
284
+ with gr.Accordion("💡 Example Prompts", open=False):
 
 
 
 
285
  gr.Examples(
286
  examples=[
287
+ ["A 45-second tech podcast intro with futuristic synth effects"],
288
+ ["A 15-second coffee shop radio ad with morning acoustic vibes"],
289
  ["A 60-second documentary trailer with epic orchestral music"]
290
  ],
291
  inputs=concept_input
292
  )
 
 
 
 
 
293
 
 
294
  gr.Markdown("""
295
+ <div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 1px solid #444;">
296
+ <p style="font-size: 0.9em; color: #888;">
297
+ Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #66b3ff;">bilsimaging.com</a>
298
+ </p>
299
+ <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
300
+ <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759"/>
301
+ </a>
302
+ </div>
303
  """)
304
+
305
+ # Event Handling
306
+ generate_btn.click(
307
+ generate_script,
308
+ inputs=[concept_input, model_selector, duration_slider, temp_slider],
309
+ outputs=[script_output, sound_output, music_output]
310
+ )
311
 
312
+ voice_btn.click(
313
+ generate_voice,
314
+ inputs=[script_output, tts_selector, speed_slider],
315
+ outputs=voice_preview
316
+ )
317
+
318
+ music_btn.click(
319
+ generate_music,
320
+ inputs=[music_output, music_duration, music_temp, guidance_scale],
321
+ outputs=music_preview
322
+ )
323
+
324
+ mix_btn.click(
325
+ blend_audio,
326
+ inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
327
+ outputs=final_mix
328
+ )
329
 
330
  if __name__ == "__main__":
331
  demo.launch(server_name="0.0.0.0", server_port=7860)