Bils committed on
Commit
9a49723
·
verified ·
1 Parent(s): d45e0a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -73
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import re
3
  import torch
4
  import tempfile
 
5
  from scipy.io.wavfile import write
6
  from pydub import AudioSegment
7
  from dotenv import load_dotenv
@@ -20,8 +21,9 @@ from transformers import (
20
  from TTS.api import TTS
21
 
22
  # ---------------------------------------------------------------------
23
- # Load Environment Variables
24
  # ---------------------------------------------------------------------
 
25
  load_dotenv()
26
  HF_TOKEN = os.getenv("HF_TOKEN")
27
 
@@ -39,15 +41,14 @@ def clean_text(text: str) -> str:
39
  """
40
  Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
41
  """
42
- # Remove all asterisks. You can add more cleaning steps here as needed.
43
  return re.sub(r'\*', '', text)
44
 
45
  # ---------------------------------------------------------------------
46
- # Helper Functions
47
  # ---------------------------------------------------------------------
48
  def get_llama_pipeline(model_id: str, token: str):
49
  """
50
- Returns a cached LLaMA pipeline if available; otherwise, loads it.
51
  """
52
  if model_id in LLAMA_PIPELINES:
53
  return LLAMA_PIPELINES[model_id]
@@ -67,7 +68,7 @@ def get_llama_pipeline(model_id: str, token: str):
67
 
68
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
69
  """
70
- Returns a cached MusicGen model if available; otherwise, loads it.
71
  Uses the 'large' variant for higher quality outputs.
72
  """
73
  if model_key in MUSICGEN_MODELS:
@@ -75,7 +76,6 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
75
 
76
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
77
  processor = AutoProcessor.from_pretrained(model_key)
78
-
79
  device = "cuda" if torch.cuda.is_available() else "cpu"
80
  model.to(device)
81
  MUSICGEN_MODELS[model_key] = (model, processor)
@@ -84,7 +84,7 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
84
 
85
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
86
  """
87
- Returns a cached TTS model if available; otherwise, loads it.
88
  """
89
  if model_name in TTS_MODELS:
90
  return TTS_MODELS[model_name]
@@ -100,18 +100,18 @@ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
100
  @spaces.GPU(duration=100)
101
  def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
102
  """
103
- Generates a script, sound design suggestions, and music ideas from a user prompt.
104
- Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
105
  """
106
  try:
107
  text_pipeline = get_llama_pipeline(model_id, token)
108
 
109
  system_prompt = (
110
  "You are an expert radio imaging producer specializing in sound design and music. "
111
- f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
112
- "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
113
- "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
114
- "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
115
  )
116
  combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
117
 
@@ -127,37 +127,20 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
127
  if "Output:" in generated_text:
128
  generated_text = generated_text.split("Output:")[-1].strip()
129
 
130
- # Default placeholders
131
- voice_script = "No voice-over script found."
132
- sound_design = "No sound design suggestions found."
133
- music_suggestions = "No music suggestions found."
134
-
135
- # Voice-Over Script
136
- if "Voice-Over Script:" in generated_text:
137
- parts = generated_text.split("Voice-Over Script:")
138
- voice_script_part = parts[1]
139
- if "Sound Design Suggestions:" in voice_script_part:
140
- voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
141
- else:
142
- voice_script = voice_script_part.strip()
143
-
144
- # Sound Design
145
- if "Sound Design Suggestions:" in generated_text:
146
- parts = generated_text.split("Sound Design Suggestions:")
147
- sound_design_part = parts[1]
148
- if "Music Suggestions:" in sound_design_part:
149
- sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
150
- else:
151
- sound_design = sound_design_part.strip()
152
-
153
- # Music Suggestions
154
- if "Music Suggestions:" in generated_text:
155
- parts = generated_text.split("Music Suggestions:")
156
- music_suggestions = parts[1].strip()
157
 
158
  return voice_script, sound_design, music_suggestions
159
 
160
  except Exception as e:
 
161
  return f"Error generating script: {e}", "", ""
162
 
163
 
@@ -167,24 +150,22 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
167
  @spaces.GPU(duration=100)
168
  def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
169
  """
170
- Generates a voice-over from the provided script using the Coqui TTS model.
171
  Returns the file path to the generated .wav file.
172
  """
173
  try:
174
  if not script.strip():
175
  return "Error: No script provided."
176
 
177
- # Clean the script to remove special characters (e.g., asterisks) that may produce warnings
178
  cleaned_script = clean_text(script)
179
-
180
  tts_model = get_tts_model(tts_model_name)
181
 
182
- # Generate and save voice
183
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
184
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
185
  return output_path
186
 
187
  except Exception as e:
 
188
  return f"Error generating voice: {e}"
189
 
190
 
@@ -194,7 +175,7 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
194
  @spaces.GPU(duration=200)
195
  def generate_music(prompt: str, audio_length: int):
196
  """
197
- Generates music from the 'facebook/musicgen-large' model based on the prompt.
198
  Returns the file path to the generated .wav file.
199
  """
200
  try:
@@ -203,10 +184,9 @@ def generate_music(prompt: str, audio_length: int):
203
 
204
  model_key = "facebook/musicgen-large"
205
  musicgen_model, musicgen_processor = get_musicgen_model(model_key)
206
-
207
  device = "cuda" if torch.cuda.is_available() else "cpu"
208
- inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
209
 
 
210
  with torch.inference_mode():
211
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
212
 
@@ -219,6 +199,7 @@ def generate_music(prompt: str, audio_length: int):
219
  return output_path
220
 
221
  except Exception as e:
 
222
  return f"Error generating music: {e}"
223
 
224
 
@@ -229,9 +210,9 @@ def generate_music(prompt: str, audio_length: int):
229
  def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
230
  """
231
  Blends two audio files (voice and music).
232
- 1. If music < voice, loops the music until it meets/exceeds the voice duration.
233
- 2. If music > voice, trims music to the voice duration.
234
- 3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
235
  Returns the file path to the blended .wav file.
236
  """
237
  try:
@@ -242,18 +223,16 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
242
  music = AudioSegment.from_wav(music_path)
243
 
244
  voice_len = len(voice) # in milliseconds
245
- music_len = len(music) # in milliseconds
246
 
247
- # Loop music if it's shorter than the voice
248
- if music_len < voice_len:
249
  looped_music = AudioSegment.empty()
250
  while len(looped_music) < voice_len:
251
  looped_music += music
252
  music = looped_music
253
 
254
- # Trim music if it's longer than the voice
255
- if len(music) > voice_len:
256
- music = music[:voice_len]
257
 
258
  if ducking:
259
  ducked_music = music - duck_level
@@ -266,11 +245,12 @@ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int
266
  return output_path
267
 
268
  except Exception as e:
 
269
  return f"Error blending audio: {e}"
270
 
271
 
272
  # ---------------------------------------------------------------------
273
- # Gradio Interface with Enhanced UI
274
  # ---------------------------------------------------------------------
275
  with gr.Blocks(css="""
276
  /* Global Styles */
@@ -314,26 +294,26 @@ with gr.Blocks(css="""
314
  # Custom Header
315
  with gr.Row(elem_classes="header"):
316
  gr.Markdown("""
317
- <h1>🎧 AI Promo Studio</h1>
318
- <p>Your all-in-one AI solution for crafting engaging audio promos.</p>
319
  """)
320
 
321
  gr.Markdown("""
322
- Welcome to **AI Promo Studio**! This platform leverages state-of-the-art AI models to help you generate:
323
 
324
- - **Script**: Generate a compelling voice-over script with LLaMA.
325
- - **Voice Synthesis**: Create natural-sounding voice-overs using Coqui TTS.
326
- - **Music Production**: Produce custom music tracks with MusicGen.
327
- - **Audio Blending**: Seamlessly blend voice and music with options for ducking.
328
  """)
329
 
330
  with gr.Tabs():
331
- # Step 1: Generate Script
332
  with gr.Tab("📝 Script Generation"):
333
  with gr.Row():
334
  user_prompt = gr.Textbox(
335
  label="Promo Idea",
336
- placeholder="E.g., A 30-second promo for a morning show...",
337
  lines=2
338
  )
339
  with gr.Row():
@@ -343,7 +323,7 @@ with gr.Blocks(css="""
343
  placeholder="Enter a valid Hugging Face model ID"
344
  )
345
  duration = gr.Slider(
346
- label="Desired Promo Duration (seconds)",
347
  minimum=15,
348
  maximum=60,
349
  step=15,
@@ -355,12 +335,12 @@ with gr.Blocks(css="""
355
  music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
356
 
357
  generate_script_button.click(
358
- fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
359
  inputs=[user_prompt, llama_model_id, duration],
360
  outputs=[script_output, sound_design_output, music_suggestion_output],
361
  )
362
 
363
- # Step 2: Generate Voice
364
  with gr.Tab("🎤 Voice Synthesis"):
365
  gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
366
  selected_tts_model = gr.Dropdown(
@@ -382,7 +362,7 @@ with gr.Blocks(css="""
382
  outputs=voice_audio_output,
383
  )
384
 
385
- # Step 3: Generate Music
386
  with gr.Tab("🎶 Music Production"):
387
  gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
388
  audio_length = gr.Slider(
@@ -397,12 +377,12 @@ with gr.Blocks(css="""
397
  music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
398
 
399
  generate_music_button.click(
400
- fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
401
  inputs=[music_suggestion_output, audio_length],
402
  outputs=[music_output],
403
  )
404
 
405
- # Step 4: Blend Audio
406
  with gr.Tab("🎚️ Audio Blending"):
407
  gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
408
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
@@ -428,7 +408,7 @@ with gr.Blocks(css="""
428
  <hr>
429
  Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
430
  <br>
431
- <small>AI Promo Studio &copy; 2025</small>
432
  </div>
433
  """)
434
 
 
2
  import re
3
  import torch
4
  import tempfile
5
+ import logging
6
  from scipy.io.wavfile import write
7
  from pydub import AudioSegment
8
  from dotenv import load_dotenv
 
21
  from TTS.api import TTS
22
 
23
  # ---------------------------------------------------------------------
24
+ # Setup Logging and Environment Variables
25
  # ---------------------------------------------------------------------
26
+ logging.basicConfig(level=logging.INFO)
27
  load_dotenv()
28
  HF_TOKEN = os.getenv("HF_TOKEN")
29
 
 
41
  """
42
  Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
43
  """
 
44
  return re.sub(r'\*', '', text)
45
 
46
  # ---------------------------------------------------------------------
47
+ # Model Helper Functions
48
  # ---------------------------------------------------------------------
49
  def get_llama_pipeline(model_id: str, token: str):
50
  """
51
+ Returns a cached LLaMA text-generation pipeline if available; otherwise, loads and caches it.
52
  """
53
  if model_id in LLAMA_PIPELINES:
54
  return LLAMA_PIPELINES[model_id]
 
68
 
69
  def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
70
  """
71
+ Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
72
  Uses the 'large' variant for higher quality outputs.
73
  """
74
  if model_key in MUSICGEN_MODELS:
 
76
 
77
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
78
  processor = AutoProcessor.from_pretrained(model_key)
 
79
  device = "cuda" if torch.cuda.is_available() else "cpu"
80
  model.to(device)
81
  MUSICGEN_MODELS[model_key] = (model, processor)
 
84
 
85
  def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
86
  """
87
+ Returns a cached TTS model if available; otherwise, loads and caches it.
88
  """
89
  if model_name in TTS_MODELS:
90
  return TTS_MODELS[model_name]
 
100
  @spaces.GPU(duration=100)
101
  def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
102
  """
103
+ Generates a voice-over script, sound design suggestions, and music ideas from a user prompt.
104
+ Returns a tuple: (voice_script, sound_design, music_suggestions).
105
  """
106
  try:
107
  text_pipeline = get_llama_pipeline(model_id, token)
108
 
109
  system_prompt = (
110
  "You are an expert radio imaging producer specializing in sound design and music. "
111
+ f"Based on the user's concept and the selected duration of {duration} seconds, produce the following:\n"
112
+ "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'\n"
113
+ "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'\n"
114
+ "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'"
115
  )
116
  combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
117
 
 
127
  if "Output:" in generated_text:
128
  generated_text = generated_text.split("Output:")[-1].strip()
129
 
130
+ # Extract all three sections with a single regex; if any header is missing or out of order, all three outputs fall back to the placeholders.
131
+ pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
132
+ match = re.search(pattern, generated_text, re.DOTALL)
133
+ if match:
134
+ voice_script, sound_design, music_suggestions = (grp.strip() for grp in match.groups())
135
+ else:
136
+ voice_script = "No voice-over script found."
137
+ sound_design = "No sound design suggestions found."
138
+ music_suggestions = "No music suggestions found."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  return voice_script, sound_design, music_suggestions
141
 
142
  except Exception as e:
143
+ logging.exception("Error generating script")
144
  return f"Error generating script: {e}", "", ""
145
 
146
 
 
150
  @spaces.GPU(duration=100)
151
  def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
152
  """
153
+ Generates a voice-over audio file from the provided script using Coqui TTS.
154
  Returns the file path to the generated .wav file.
155
  """
156
  try:
157
  if not script.strip():
158
  return "Error: No script provided."
159
 
 
160
  cleaned_script = clean_text(script)
 
161
  tts_model = get_tts_model(tts_model_name)
162
 
 
163
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
164
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
165
  return output_path
166
 
167
  except Exception as e:
168
+ logging.exception("Error generating voice")
169
  return f"Error generating voice: {e}"
170
 
171
 
 
175
  @spaces.GPU(duration=200)
176
  def generate_music(prompt: str, audio_length: int):
177
  """
178
+ Generates a music track from the 'facebook/musicgen-large' model based on the prompt.
179
  Returns the file path to the generated .wav file.
180
  """
181
  try:
 
184
 
185
  model_key = "facebook/musicgen-large"
186
  musicgen_model, musicgen_processor = get_musicgen_model(model_key)
 
187
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
188
 
189
+ inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
190
  with torch.inference_mode():
191
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
192
 
 
199
  return output_path
200
 
201
  except Exception as e:
202
+ logging.exception("Error generating music")
203
  return f"Error generating music: {e}"
204
 
205
 
 
210
  def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
211
  """
212
  Blends two audio files (voice and music).
213
+ - Loops music if shorter than voice.
214
+ - Trims music if longer than voice.
215
+ - Applies ducking to lower music volume during voice segments if enabled.
216
  Returns the file path to the blended .wav file.
217
  """
218
  try:
 
223
  music = AudioSegment.from_wav(music_path)
224
 
225
  voice_len = len(voice) # in milliseconds
 
226
 
227
+ # Loop music if it's shorter than voice
228
+ if len(music) < voice_len:
229
  looped_music = AudioSegment.empty()
230
  while len(looped_music) < voice_len:
231
  looped_music += music
232
  music = looped_music
233
 
234
+ # Trim music to match voice duration
235
+ music = music[:voice_len]
 
236
 
237
  if ducking:
238
  ducked_music = music - duck_level
 
245
  return output_path
246
 
247
  except Exception as e:
248
+ logging.exception("Error blending audio")
249
  return f"Error blending audio: {e}"
250
 
251
 
252
  # ---------------------------------------------------------------------
253
+ # Gradio Interface with Enhanced UI for Ai Ads Promo
254
  # ---------------------------------------------------------------------
255
  with gr.Blocks(css="""
256
  /* Global Styles */
 
294
  # Custom Header
295
  with gr.Row(elem_classes="header"):
296
  gr.Markdown("""
297
+ <h1>🎧 Ai Ads Promo</h1>
298
+ <p>Your all-in-one AI solution for crafting engaging audio ads.</p>
299
  """)
300
 
301
  gr.Markdown("""
302
+ Welcome to **Ai Ads Promo**! This platform leverages state-of-the-art AI models to help you generate:
303
 
304
+ - **Script**: Create a compelling voice-over script using LLaMA.
305
+ - **Voice Synthesis**: Produce natural-sounding voice-overs with Coqui TTS.
306
+ - **Music Production**: Generate custom music tracks with MusicGen.
307
+ - **Audio Blending**: Seamlessly blend voice and music with optional ducking.
308
  """)
309
 
310
  with gr.Tabs():
311
+ # Step 1: Script Generation
312
  with gr.Tab("📝 Script Generation"):
313
  with gr.Row():
314
  user_prompt = gr.Textbox(
315
  label="Promo Idea",
316
+ placeholder="E.g., A 30-second ad for a morning show...",
317
  lines=2
318
  )
319
  with gr.Row():
 
323
  placeholder="Enter a valid Hugging Face model ID"
324
  )
325
  duration = gr.Slider(
326
+ label="Desired Ad Duration (seconds)",
327
  minimum=15,
328
  maximum=60,
329
  step=15,
 
335
  music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
336
 
337
  generate_script_button.click(
338
+ fn=lambda prompt, model_id, dur: generate_script(prompt, model_id, HF_TOKEN, dur),
339
  inputs=[user_prompt, llama_model_id, duration],
340
  outputs=[script_output, sound_design_output, music_suggestion_output],
341
  )
342
 
343
+ # Step 2: Voice Synthesis
344
  with gr.Tab("🎤 Voice Synthesis"):
345
  gr.Markdown("Generate a natural-sounding voice-over using Coqui TTS.")
346
  selected_tts_model = gr.Dropdown(
 
362
  outputs=voice_audio_output,
363
  )
364
 
365
+ # Step 3: Music Production
366
  with gr.Tab("🎶 Music Production"):
367
  gr.Markdown("Generate a custom music track using the **MusicGen Large** model.")
368
  audio_length = gr.Slider(
 
377
  music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
378
 
379
  generate_music_button.click(
380
+ fn=lambda music_prompt, length: generate_music(music_prompt, length),
381
  inputs=[music_suggestion_output, audio_length],
382
  outputs=[music_output],
383
  )
384
 
385
+ # Step 4: Audio Blending
386
  with gr.Tab("🎚️ Audio Blending"):
387
  gr.Markdown("Blend your voice-over and music track. Music will be looped/truncated to match the voice duration. Enable ducking to lower the music during voice segments.")
388
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
 
408
  <hr>
409
  Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #88aaff;">bilsimaging.com</a>
410
  <br>
411
+ <small>Ai Ads Promo &copy; 2025</small>
412
  </div>
413
  """)
414