Bils committed on
Commit
7602ef4
·
verified ·
1 Parent(s): 7b70b10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -91
app.py CHANGED
@@ -3,6 +3,8 @@ import re
3
  import torch
4
  import tempfile
5
  import logging
 
 
6
  from scipy.io.wavfile import write
7
  from pydub import AudioSegment
8
  from dotenv import load_dotenv
@@ -30,37 +32,53 @@ from packaging import version
30
  # ---------------------------------------------------------------------
31
  # Setup Logging and Environment Variables
32
  # ---------------------------------------------------------------------
33
- logging.basicConfig(level=logging.INFO)
34
  load_dotenv()
35
  HF_TOKEN = os.getenv("HF_TOKEN")
 
 
36
 
37
  # ---------------------------------------------------------------------
38
  # Global Model Caches
39
  # ---------------------------------------------------------------------
40
- LLAMA_PIPELINES = {}
41
- MUSICGEN_MODELS = {}
42
- TTS_MODELS = {}
43
- SOUND_DESIGN_PIPELINES = {}
44
 
45
  # ---------------------------------------------------------------------
46
- # Utility Function
47
  # ---------------------------------------------------------------------
48
  def clean_text(text: str) -> str:
49
  """
50
- Removes undesired characters (e.g., asterisks) that might not be recognized by the model's vocabulary.
 
 
 
 
 
 
51
  """
52
  return re.sub(r'\*', '', text)
53
 
54
  # ---------------------------------------------------------------------
55
  # Model Helper Functions
56
  # ---------------------------------------------------------------------
57
- def get_llama_pipeline(model_id: str, token: str):
58
  """
59
- Returns a cached LLaMA text-generation pipeline if available; otherwise, loads and caches it.
 
 
 
 
 
 
 
60
  """
61
  if model_id in LLAMA_PIPELINES:
62
  return LLAMA_PIPELINES[model_id]
63
 
 
64
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
65
  model = AutoModelForCausalLM.from_pretrained(
66
  model_id,
@@ -73,14 +91,20 @@ def get_llama_pipeline(model_id: str, token: str):
73
  LLAMA_PIPELINES[model_id] = text_pipeline
74
  return text_pipeline
75
 
76
- def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
77
  """
78
- Returns a cached MusicGen model and processor if available; otherwise, loads and caches them.
79
- Uses the 'large' variant for higher quality outputs.
 
 
 
 
 
80
  """
81
  if model_key in MUSICGEN_MODELS:
82
  return MUSICGEN_MODELS[model_key]
83
 
 
84
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
85
  processor = AutoProcessor.from_pretrained(model_key)
86
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -88,30 +112,51 @@ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
88
  MUSICGEN_MODELS[model_key] = (model, processor)
89
  return model, processor
90
 
91
- def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
92
  """
93
- Returns a cached TTS model if available; otherwise, loads and caches it.
 
 
 
 
 
 
94
  """
95
  if model_name in TTS_MODELS:
96
  return TTS_MODELS[model_name]
97
 
 
98
  tts_model = TTS(model_name)
99
  TTS_MODELS[model_name] = tts_model
100
  return tts_model
101
 
102
- def get_sound_design_pipeline(model_name: str, token: str):
103
  """
104
- Returns a cached DiffusionPipeline for sound design if available;
105
- otherwise, it loads and caches the pipeline.
 
 
 
 
106
 
107
- NOTE: AudioLDM2Pipeline is available only in diffusers>=0.21.0.
108
- Since your requirements fix diffusers==0.20.2, this function will raise an error.
 
 
 
109
  """
110
  if version.parse(diffusers.__version__) < version.parse("0.21.0"):
111
  raise ValueError("AudioLDM2 requires diffusers>=0.21.0. Please upgrade your diffusers package.")
 
112
  if model_name in SOUND_DESIGN_PIPELINES:
113
  return SOUND_DESIGN_PIPELINES[model_name]
114
- pipe = DiffusionPipeline.from_pretrained(model_name, pipeline_class=AudioLDMPipeline, use_auth_token=token)
 
 
 
 
 
 
115
  SOUND_DESIGN_PIPELINES[model_name] = pipe
116
  return pipe
117
 
@@ -119,14 +164,21 @@ def get_sound_design_pipeline(model_name: str, token: str):
119
  # Script Generation Function
120
  # ---------------------------------------------------------------------
121
  @spaces.GPU(duration=100)
122
- def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
123
  """
124
- Generates a voice-over script, sound design suggestions, and music ideas from a user prompt.
125
- Returns a tuple: (voice_script, sound_design, music_suggestions).
 
 
 
 
 
 
 
 
126
  """
127
  try:
128
  text_pipeline = get_llama_pipeline(model_id, token)
129
-
130
  system_prompt = (
131
  "You are an expert radio imaging producer specializing in sound design and music. "
132
  f"Based on the user's concept and the selected duration of {duration} seconds, produce the following:\n"
@@ -148,6 +200,7 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
148
  if "Output:" in generated_text:
149
  generated_text = generated_text.split("Output:")[-1].strip()
150
 
 
151
  pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
152
  match = re.search(pattern, generated_text, re.DOTALL)
153
  if match:
@@ -167,10 +220,16 @@ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
167
  # Voice-Over Generation Function
168
  # ---------------------------------------------------------------------
169
  @spaces.GPU(duration=100)
170
- def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
171
  """
172
- Generates a voice-over audio file from the provided script using Coqui TTS.
173
- Returns the file path to the generated .wav file.
 
 
 
 
 
 
174
  """
175
  try:
176
  if not script.strip():
@@ -178,7 +237,6 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
178
 
179
  cleaned_script = clean_text(script)
180
  tts_model = get_tts_model(tts_model_name)
181
-
182
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
183
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
184
  return output_path
@@ -191,10 +249,16 @@ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/ta
191
  # Music Generation Function
192
  # ---------------------------------------------------------------------
193
  @spaces.GPU(duration=200)
194
- def generate_music(prompt: str, audio_length: int):
195
  """
196
- Generates a music track from the 'facebook/musicgen-large' model based on the prompt.
197
- Returns the file path to the generated .wav file.
 
 
 
 
 
 
198
  """
199
  try:
200
  if not prompt.strip():
@@ -209,11 +273,10 @@ def generate_music(prompt: str, audio_length: int):
209
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
210
 
211
  audio_data = outputs[0, 0].cpu().numpy()
212
- normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
213
-
214
  output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
215
  write(output_path, 44100, normalized_audio)
216
-
217
  return output_path
218
 
219
  except Exception as e:
@@ -224,21 +287,23 @@ def generate_music(prompt: str, audio_length: int):
224
  # Sound Design Generation Function
225
  # ---------------------------------------------------------------------
226
  @spaces.GPU(duration=200)
227
- def generate_sound_design(prompt: str):
228
  """
229
- Generates a sound design audio file based on the provided prompt using AudioLDM 2.
230
- Returns the file path to the generated .wav file.
 
 
 
 
 
231
  """
232
  try:
233
  if not prompt.strip():
234
  return "Error: No sound design suggestion provided."
235
 
236
  pipe = get_sound_design_pipeline("cvssp/audioldm2", HF_TOKEN)
237
-
238
- # Generate audio from the prompt; assumes the pipeline returns a dict with key 'audios'
239
- result = pipe(prompt)
240
  audio_samples = result["audios"][0]
241
-
242
  normalized_audio = (audio_samples / np.max(np.abs(audio_samples)) * 32767).astype("int16")
243
  output_path = os.path.join(tempfile.gettempdir(), "sound_design_generated.wav")
244
  write(output_path, 44100, normalized_audio)
@@ -249,19 +314,27 @@ def generate_sound_design(prompt: str):
249
  return f"Error generating sound design: {e}"
250
 
251
  # ---------------------------------------------------------------------
252
- # Audio Blending with Duration Sync & Ducking (Voice + Sound Design + Music)
253
  # ---------------------------------------------------------------------
254
  @spaces.GPU(duration=100)
255
- def blend_audio(voice_path: str, sound_effect_path: str, music_path: str, ducking: bool, duck_level: int = 10):
256
  """
257
- Blends three audio files (voice, sound design/sound effect, and music):
258
- - Loops music and sound design if shorter than the voice track.
259
- - Trims both to match the voice duration.
260
- - Applies ducking to lower music and sound design volumes during voice segments if enabled.
261
- Returns the file path to the blended .wav file.
 
 
 
 
 
 
 
 
 
262
  """
263
  try:
264
- # Verify input files exist
265
  for path in [voice_path, sound_effect_path, music_path]:
266
  if not os.path.isfile(path):
267
  return f"Error: Missing audio file for {path}"
@@ -270,34 +343,30 @@ def blend_audio(voice_path: str, sound_effect_path: str, music_path: str, duckin
270
  voice = AudioSegment.from_wav(voice_path)
271
  music = AudioSegment.from_wav(music_path)
272
  sound_effect = AudioSegment.from_wav(sound_effect_path)
273
-
274
  voice_len = len(voice) # duration in milliseconds
275
 
276
- # Loop or trim music to match voice duration
277
  if len(music) < voice_len:
278
- looped_music = AudioSegment.empty()
279
- while len(looped_music) < voice_len:
280
- looped_music += music
281
- music = looped_music
282
- music = music[:voice_len]
283
 
284
- # Loop or trim sound effect to match voice duration
285
  if len(sound_effect) < voice_len:
286
- looped_effect = AudioSegment.empty()
287
- while len(looped_effect) < voice_len:
288
- looped_effect += sound_effect
289
- sound_effect = looped_effect
290
- sound_effect = sound_effect[:voice_len]
291
 
292
- # Apply ducking to background tracks if enabled
293
  if ducking:
294
  music = music - duck_level
295
  sound_effect = sound_effect - duck_level
296
 
297
- # Combine music and sound effect into a background track
298
  background = music.overlay(sound_effect)
299
-
300
- # Overlay voice on top of the background
301
  final_audio = background.overlay(voice)
302
 
303
  output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
@@ -360,25 +429,13 @@ with gr.Blocks(css="""
360
  gr.Markdown("""
361
  **Welcome to Ai Ads Promo!**
362
 
363
- This simple, easy-to-use app helps you create amazing audio ads in just a few steps. Here’s how it works:
364
-
365
- 1. **Script Generation:**
366
- - Share your idea and let our AI craft a clear and engaging voice-over script, along with sound design and music suggestions.
367
- 2. **Voice Synthesis:**
368
- - Convert your script into a natural-sounding voice-over using advanced text-to-speech technology.
369
- 3. **Music Production:**
370
- - Generate a custom music track that perfectly fits your ad.
371
- 4. **Sound Design:**
372
- - Generate creative sound effects based on our sound design suggestions.
373
- 5. **Audio Blending:**
374
- - Combine your voice-over, sound effects, and music seamlessly. Enable ducking to lower background audio during voice segments.
375
-
376
- **Benefits:**
377
- - **Easy to Use:** Designed for everyone – no technical skills required.
378
- - **Fast Results:** Quickly produce professional-sounding audio ads.
379
- - **All-In-One:** Everything you need in one convenient app.
380
-
381
- Get started now and create your perfect audio ad with Ai Ads Promo!
382
  """)
383
 
384
  with gr.Tabs():
@@ -458,7 +515,7 @@ with gr.Blocks(css="""
458
 
459
  # Step 4: Sound Design Generation
460
  with gr.Tab("🎧 Sound Design Generation"):
461
- gr.Markdown("Generate a creative sound design track based on the sound design suggestions from the script.")
462
  generate_sound_design_button = gr.Button("Generate Sound Design", variant="primary")
463
  sound_design_audio_output = gr.Audio(label="Generated Sound Design (WAV)", type="filepath")
464
 
@@ -470,7 +527,7 @@ with gr.Blocks(css="""
470
 
471
  # Step 5: Audio Blending (Voice + Sound Design + Music)
472
  with gr.Tab("🎚️ Audio Blending"):
473
- gr.Markdown("Blend your voice-over, sound design, and music track. The background audio (music and sound design) can be ducked during voice segments.")
474
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
475
  duck_level_slider = gr.Slider(
476
  label="Ducking Level (dB attenuation)",
@@ -488,7 +545,7 @@ with gr.Blocks(css="""
488
  outputs=blended_output
489
  )
490
 
491
- # Footer
492
  gr.Markdown("""
493
  <div class="footer">
494
  <hr>
@@ -497,8 +554,6 @@ with gr.Blocks(css="""
497
  <small>Ai Ads Promo &copy; 2025</small>
498
  </div>
499
  """)
500
-
501
- # Visitor Badge
502
  gr.HTML("""
503
  <div style="text-align: center; margin-top: 1rem;">
504
  <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
@@ -507,4 +562,5 @@ with gr.Blocks(css="""
507
  </div>
508
  """)
509
 
510
- demo.launch(debug=True)
 
 
3
  import torch
4
  import tempfile
5
  import logging
6
+ import math
7
+ from typing import Tuple, Union, Any
8
  from scipy.io.wavfile import write
9
  from pydub import AudioSegment
10
  from dotenv import load_dotenv
 
32
  # ---------------------------------------------------------------------
33
  # Setup Logging and Environment Variables
34
  # ---------------------------------------------------------------------
35
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
36
  load_dotenv()
37
  HF_TOKEN = os.getenv("HF_TOKEN")
38
+ if not HF_TOKEN:
39
+ logging.warning("HF_TOKEN is not set in your environment. Some model downloads might fail.")
40
 
41
  # ---------------------------------------------------------------------
42
  # Global Model Caches
43
  # ---------------------------------------------------------------------
44
+ LLAMA_PIPELINES: dict[str, Any] = {}
45
+ MUSICGEN_MODELS: dict[str, Any] = {}
46
+ TTS_MODELS: dict[str, Any] = {}
47
+ SOUND_DESIGN_PIPELINES: dict[str, Any] = {}
48
 
49
  # ---------------------------------------------------------------------
50
+ # Utility Functions
51
  # ---------------------------------------------------------------------
52
  def clean_text(text: str) -> str:
53
  """
54
+ Remove undesired characters that may not be recognized by the model.
55
+
56
+ Args:
57
+ text (str): Input text to be cleaned.
58
+
59
+ Returns:
60
+ str: Cleaned text.
61
  """
62
  return re.sub(r'\*', '', text)
63
 
64
  # ---------------------------------------------------------------------
65
  # Model Helper Functions
66
  # ---------------------------------------------------------------------
67
+ def get_llama_pipeline(model_id: str, token: str) -> Any:
68
  """
69
+ Returns a cached LLaMA text-generation pipeline or loads a new one.
70
+
71
+ Args:
72
+ model_id (str): Hugging Face model ID.
73
+ token (str): Hugging Face token.
74
+
75
+ Returns:
76
+ Any: A Hugging Face text-generation pipeline.
77
  """
78
  if model_id in LLAMA_PIPELINES:
79
  return LLAMA_PIPELINES[model_id]
80
 
81
+ logging.info(f"Loading LLaMA model from {model_id}...")
82
  tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
83
  model = AutoModelForCausalLM.from_pretrained(
84
  model_id,
 
91
  LLAMA_PIPELINES[model_id] = text_pipeline
92
  return text_pipeline
93
 
94
+ def get_musicgen_model(model_key: str = "facebook/musicgen-large") -> Tuple[Any, Any]:
95
  """
96
+ Returns a cached MusicGen model and processor, or loads new ones.
97
+
98
+ Args:
99
+ model_key (str): Hugging Face model key (default is 'facebook/musicgen-large').
100
+
101
+ Returns:
102
+ Tuple[Any, Any]: The MusicGen model and its processor.
103
  """
104
  if model_key in MUSICGEN_MODELS:
105
  return MUSICGEN_MODELS[model_key]
106
 
107
+ logging.info(f"Loading MusicGen model from {model_key}...")
108
  model = MusicgenForConditionalGeneration.from_pretrained(model_key)
109
  processor = AutoProcessor.from_pretrained(model_key)
110
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
112
  MUSICGEN_MODELS[model_key] = (model, processor)
113
  return model, processor
114
 
115
+ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC") -> TTS:
116
  """
117
+ Returns a cached TTS model or loads a new one.
118
+
119
+ Args:
120
+ model_name (str): Identifier for the TTS model.
121
+
122
+ Returns:
123
+ TTS: A Coqui TTS model.
124
  """
125
  if model_name in TTS_MODELS:
126
  return TTS_MODELS[model_name]
127
 
128
+ logging.info(f"Loading TTS model: {model_name}...")
129
  tts_model = TTS(model_name)
130
  TTS_MODELS[model_name] = tts_model
131
  return tts_model
132
 
133
+ def get_sound_design_pipeline(model_name: str, token: str) -> Any:
134
  """
135
+ Returns a cached DiffusionPipeline for sound design, or loads a new one.
136
+ Raises an error if diffusers version is less than 0.21.0.
137
+
138
+ Args:
139
+ model_name (str): The model name to load.
140
+ token (str): Hugging Face token.
141
 
142
+ Returns:
143
+ Any: A DiffusionPipeline for sound design.
144
+
145
+ Raises:
146
+ ValueError: If diffusers version is lower than 0.21.0.
147
  """
148
  if version.parse(diffusers.__version__) < version.parse("0.21.0"):
149
  raise ValueError("AudioLDM2 requires diffusers>=0.21.0. Please upgrade your diffusers package.")
150
+
151
  if model_name in SOUND_DESIGN_PIPELINES:
152
  return SOUND_DESIGN_PIPELINES[model_name]
153
+
154
+ logging.info(f"Loading sound design pipeline from {model_name}...")
155
+ pipe = DiffusionPipeline.from_pretrained(
156
+ model_name,
157
+ pipeline_class=AudioLDMPipeline,
158
+ use_auth_token=token
159
+ )
160
  SOUND_DESIGN_PIPELINES[model_name] = pipe
161
  return pipe
162
 
 
164
  # Script Generation Function
165
  # ---------------------------------------------------------------------
166
  @spaces.GPU(duration=100)
167
+ def generate_script(user_prompt: str, model_id: str, token: str, duration: int) -> Tuple[str, str, str]:
168
  """
169
+ Generates a voice-over script, sound design suggestions, and music ideas based on the user prompt.
170
+
171
+ Args:
172
+ user_prompt (str): The user-provided concept.
173
+ model_id (str): The LLaMA model ID.
174
+ token (str): Hugging Face token.
175
+ duration (int): The desired duration in seconds.
176
+
177
+ Returns:
178
+ Tuple[str, str, str]: Voice-over script, sound design suggestions, and music suggestions.
179
  """
180
  try:
181
  text_pipeline = get_llama_pipeline(model_id, token)
 
182
  system_prompt = (
183
  "You are an expert radio imaging producer specializing in sound design and music. "
184
  f"Based on the user's concept and the selected duration of {duration} seconds, produce the following:\n"
 
200
  if "Output:" in generated_text:
201
  generated_text = generated_text.split("Output:")[-1].strip()
202
 
203
+ # Extract sections using regex
204
  pattern = r"Voice-Over Script:\s*(.*?)\s*Sound Design Suggestions:\s*(.*?)\s*Music Suggestions:\s*(.*)"
205
  match = re.search(pattern, generated_text, re.DOTALL)
206
  if match:
 
220
  # Voice-Over Generation Function
221
  # ---------------------------------------------------------------------
222
  @spaces.GPU(duration=100)
223
+ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC") -> Union[str, Any]:
224
  """
225
+ Generates a voice-over audio file from a script using Coqui TTS.
226
+
227
+ Args:
228
+ script (str): The voice-over script.
229
+ tts_model_name (str): The TTS model name.
230
+
231
+ Returns:
232
+ Union[str, Any]: The file path to the generated .wav file or an error message.
233
  """
234
  try:
235
  if not script.strip():
 
237
 
238
  cleaned_script = clean_text(script)
239
  tts_model = get_tts_model(tts_model_name)
 
240
  output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
241
  tts_model.tts_to_file(text=cleaned_script, file_path=output_path)
242
  return output_path
 
249
  # Music Generation Function
250
  # ---------------------------------------------------------------------
251
  @spaces.GPU(duration=200)
252
+ def generate_music(prompt: str, audio_length: int) -> Union[str, Any]:
253
  """
254
+ Generates a music track using the MusicGen model based on the prompt.
255
+
256
+ Args:
257
+ prompt (str): Music suggestion prompt.
258
+ audio_length (int): Number of tokens determining audio length.
259
+
260
+ Returns:
261
+ Union[str, Any]: The file path to the generated .wav file or an error message.
262
  """
263
  try:
264
  if not prompt.strip():
 
273
  outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
274
 
275
  audio_data = outputs[0, 0].cpu().numpy()
276
+ # Normalize audio data to 16-bit integer range
277
+ normalized_audio = (audio_data / np.max(np.abs(audio_data)) * 32767).astype("int16")
278
  output_path = os.path.join(tempfile.gettempdir(), "musicgen_large_generated_music.wav")
279
  write(output_path, 44100, normalized_audio)
 
280
  return output_path
281
 
282
  except Exception as e:
 
287
  # Sound Design Generation Function
288
  # ---------------------------------------------------------------------
289
  @spaces.GPU(duration=200)
290
+ def generate_sound_design(prompt: str) -> Union[str, Any]:
291
  """
292
+ Generates a sound design audio file using AudioLDM 2 based on the prompt.
293
+
294
+ Args:
295
+ prompt (str): Sound design prompt.
296
+
297
+ Returns:
298
+ Union[str, Any]: The file path to the generated .wav file or an error message.
299
  """
300
  try:
301
  if not prompt.strip():
302
  return "Error: No sound design suggestion provided."
303
 
304
  pipe = get_sound_design_pipeline("cvssp/audioldm2", HF_TOKEN)
305
+ result = pipe(prompt) # Expected to return a dict with key 'audios'
 
 
306
  audio_samples = result["audios"][0]
 
307
  normalized_audio = (audio_samples / np.max(np.abs(audio_samples)) * 32767).astype("int16")
308
  output_path = os.path.join(tempfile.gettempdir(), "sound_design_generated.wav")
309
  write(output_path, 44100, normalized_audio)
 
314
  return f"Error generating sound design: {e}"
315
 
316
  # ---------------------------------------------------------------------
317
+ # Audio Blending Function
318
  # ---------------------------------------------------------------------
319
  @spaces.GPU(duration=100)
320
+ def blend_audio(voice_path: str, sound_effect_path: str, music_path: str, ducking: bool, duck_level: int = 10) -> Union[str, Any]:
321
  """
322
+ Blends three audio files (voice, sound design, and music) by:
323
+ - Looping/trimming music and sound design to match voice duration.
324
+ - Optionally applying ducking to background tracks.
325
+ - Overlaying the voice on top of the background.
326
+
327
+ Args:
328
+ voice_path (str): Path to the voice audio file.
329
+ sound_effect_path (str): Path to the sound design audio file.
330
+ music_path (str): Path to the music audio file.
331
+ ducking (bool): Whether to apply ducking.
332
+ duck_level (int): Amount of attenuation in dB.
333
+
334
+ Returns:
335
+ Union[str, Any]: The file path to the blended .wav file or an error message.
336
  """
337
  try:
 
338
  for path in [voice_path, sound_effect_path, music_path]:
339
  if not os.path.isfile(path):
340
  return f"Error: Missing audio file for {path}"
 
343
  voice = AudioSegment.from_wav(voice_path)
344
  music = AudioSegment.from_wav(music_path)
345
  sound_effect = AudioSegment.from_wav(sound_effect_path)
 
346
  voice_len = len(voice) # duration in milliseconds
347
 
348
+ # Loop or trim music to match voice duration using pydub multiplication
349
  if len(music) < voice_len:
350
+ repeats = math.ceil(voice_len / len(music))
351
+ music = (music * repeats)[:voice_len]
352
+ else:
353
+ music = music[:voice_len]
 
354
 
355
+ # Loop or trim sound design to match voice duration
356
  if len(sound_effect) < voice_len:
357
+ repeats = math.ceil(voice_len / len(sound_effect))
358
+ sound_effect = (sound_effect * repeats)[:voice_len]
359
+ else:
360
+ sound_effect = sound_effect[:voice_len]
 
361
 
362
+ # Apply ducking if enabled
363
  if ducking:
364
  music = music - duck_level
365
  sound_effect = sound_effect - duck_level
366
 
367
+ # Overlay music and sound effect for background
368
  background = music.overlay(sound_effect)
369
+ # Overlay voice on top of background
 
370
  final_audio = background.overlay(voice)
371
 
372
  output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
 
429
  gr.Markdown("""
430
  **Welcome to Ai Ads Promo!**
431
 
432
+ This app helps you create amazing audio ads in just a few steps:
433
+
434
+ 1. **Script Generation:** Provide your idea and get a voice-over script, sound design, and music suggestions.
435
+ 2. **Voice Synthesis:** Convert the script into natural-sounding speech.
436
+ 3. **Music Production:** Generate a custom music track.
437
+ 4. **Sound Design:** Create creative sound effects.
438
+ 5. **Audio Blending:** Seamlessly blend voice, music, and sound design (with optional ducking).
 
 
 
 
 
 
 
 
 
 
 
 
439
  """)
440
 
441
  with gr.Tabs():
 
515
 
516
  # Step 4: Sound Design Generation
517
  with gr.Tab("🎧 Sound Design Generation"):
518
+ gr.Markdown("Generate a creative sound design track based on the script's suggestions.")
519
  generate_sound_design_button = gr.Button("Generate Sound Design", variant="primary")
520
  sound_design_audio_output = gr.Audio(label="Generated Sound Design (WAV)", type="filepath")
521
 
 
527
 
528
  # Step 5: Audio Blending (Voice + Sound Design + Music)
529
  with gr.Tab("🎚️ Audio Blending"):
530
+ gr.Markdown("Blend your voice-over, sound design, and music track. Enable ducking to lower background audio during voice segments.")
531
  ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
532
  duck_level_slider = gr.Slider(
533
  label="Ducking Level (dB attenuation)",
 
545
  outputs=blended_output
546
  )
547
 
548
+ # Footer and Visitor Badge
549
  gr.Markdown("""
550
  <div class="footer">
551
  <hr>
 
554
  <small>Ai Ads Promo &copy; 2025</small>
555
  </div>
556
  """)
 
 
557
  gr.HTML("""
558
  <div style="text-align: center; margin-top: 1rem;">
559
  <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
 
562
  </div>
563
  """)
564
 
565
+ if __name__ == "__main__":
566
+ demo.launch(debug=True)