Bils committed on
Commit 0105281 · verified · 1 Parent(s): 7226966

Update app.py

Files changed (1)
  1. app.py +334 -275
app.py CHANGED
@@ -13,319 +13,378 @@ from pydub import AudioSegment
  from dotenv import load_dotenv
  import tempfile
  import spaces
  from TTS.api import TTS

- # -------------------------------
- # Configuration
- # -------------------------------
  load_dotenv()
  HF_TOKEN = os.getenv("HF_TOKEN")

- MODEL_CONFIG = {
-     "llama_models": {
-         "Meta-Llama-3-8B": "meta-llama/Meta-Llama-3-8B-Instruct",
-         "Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.2",
-         "Phi-3-mini": "microsoft/Phi-3-mini-4k-instruct"
-     },
-     "tts_models": {
-         "Standard English": "tts_models/en/ljspeech/tacotron2-DDC",
-         "High Quality": "tts_models/en/ljspeech/vits",
-         "Fast Inference": "tts_models/en/sam/tacotron-DDC"
-     }
- }
-
- # -------------------------------
- # Model Manager
- # -------------------------------
- class ModelManager:
-     def __init__(self):
-         self.llama_pipelines = {}
-         self.musicgen_models = {}
-         self.tts_models = {}
-
-     def get_llama_pipeline(self, model_id, token):
-         if model_id not in self.llama_pipelines:
-             tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
-             model = AutoModelForCausalLM.from_pretrained(
-                 model_id,
-                 use_auth_token=token,
-                 torch_dtype=torch.float16,
-                 device_map="auto",
-                 attn_implementation="flash_attention_2"
-             )
-             self.llama_pipelines[model_id] = pipeline(
-                 "text-generation",
-                 model=model,
-                 tokenizer=tokenizer,
-                 device_map="auto"
-             )
-         return self.llama_pipelines[model_id]
-
-     def get_musicgen_model(self, model_key="facebook/musicgen-large"):
-         if model_key not in self.musicgen_models:
-             model = MusicgenForConditionalGeneration.from_pretrained(model_key)
-             processor = AutoProcessor.from_pretrained(model_key)
-             device = "cuda" if torch.cuda.is_available() else "cpu"
-             model.to(device)
-             self.musicgen_models[model_key] = (model, processor)
-         return self.musicgen_models[model_key]
-
-     def get_tts_model(self, model_name):
-         if model_name not in self.tts_models:
-             self.tts_models[model_name] = TTS(model_name)
-         return self.tts_models[model_name]
-
- model_manager = ModelManager()
-
- # -------------------------------
- # Core Functions
- # -------------------------------
- @spaces.GPU(duration=120)
- def generate_script(user_prompt, model_id, duration, temperature=0.7, max_tokens=512):
      try:
-         text_pipeline = model_manager.get_llama_pipeline(model_id, HF_TOKEN)
-
-         system_prompt = f"""You are an expert radio imaging producer. Create content for a {duration}-second promo:
- 1. Voice Script: [Clear narration]
- 2. Sound Design: [3-5 effects]
- 3. Music: [Genre/tempo/mood]
-
- Respond in this exact format:"""
-
-         prompt = f"{system_prompt}\nConcept: {user_prompt}\nVoice Script:"
-
-         response = text_pipeline(
-             prompt,
-             max_new_tokens=max_tokens,
-             temperature=temperature,
-             do_sample=True,
-             top_p=0.95,
-             eos_token_id=text_pipeline.tokenizer.eos_token_id
          )

-         return parse_generated_content(response[0]["generated_text"])
      except Exception as e:
-         return f"Error: {str(e)}", "", ""
-
- def parse_generated_content(text):
-     sections = {
-         "Voice Script": "",
-         "Sound Design": "",
-         "Music": ""
-     }
-     current_section = None
-
-     for line in text.split('\n'):
-         line = line.strip()
-         if "Voice Script:" in line:
-             current_section = "Voice Script"
-             line = line.replace("Voice Script:", "").strip()
-         elif "Sound Design:" in line:
-             current_section = "Sound Design"
-             line = line.replace("Sound Design:", "").strip()
-         elif "Music:" in line:
-             current_section = "Music"
-             line = line.replace("Music:", "").strip()
-
-         if current_section and line:
-             sections[current_section] += line + "\n"
-
-     return sections["Voice Script"].strip(), sections["Sound Design"].strip(), sections["Music"].strip()
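
A note on the removed parser: `parse_generated_content` walks the generated text line by line, switches sections whenever it sees one of the three labels, and accumulates everything else under the current label. A minimal sanity check of that behavior (the sample text is hypothetical, not real model output):

```python
sample = """Voice Script: Wake up with The Morning Drive!
Sound Design: Alarm clock, rooster crow, vinyl scratch
Music: Upbeat funk, 120 BPM, bright brass stabs"""

voice, sfx, music = parse_generated_content(sample)
# voice == "Wake up with The Morning Drive!"
# sfx   == "Alarm clock, rooster crow, vinyl scratch"
# music == "Upbeat funk, 120 BPM, bright brass stabs"
```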

  @spaces.GPU(duration=100)
- def generate_voice(script, tts_model, speed=1.0):
      try:
          if not script.strip():
-             return "Error: Empty script"
-
-         tts = model_manager.get_tts_model(tts_model)
-         output_path = os.path.join(tempfile.gettempdir(), "voice.wav")
-
-         tts.tts_to_file(
-             text=script,
-             file_path=output_path,
-             speed=speed
-         )
          return output_path
      except Exception as e:
-         return f"Error: {str(e)}"

- @spaces.GPU(duration=150)
- def generate_music(prompt, duration_sec=30, temperature=1.0, guidance_scale=3.0):
      try:
-         model, processor = model_manager.get_musicgen_model()
          device = "cuda" if torch.cuda.is_available() else "cpu"
-
-         inputs = processor(
-             text=[prompt],
-             padding=True,
-             return_tensors="pt",
-         ).to(device)
-
-         audio_values = model.generate(
-             **inputs,
-             max_new_tokens=int(duration_sec * 50),
-             temperature=temperature,
-             guidance_scale=guidance_scale,
-             do_sample=True
-         )

-         output_path = os.path.join(tempfile.gettempdir(), "music.wav")
-         write(output_path, 44100, audio_values[0, 0].cpu().numpy())
          return output_path
      except Exception as e:
-         return f"Error: {str(e)}"

- def blend_audio(voice_path, music_path, ducking=True, duck_level=10, crossfade=500):
      try:
          voice = AudioSegment.from_wav(voice_path)
          music = AudioSegment.from_wav(music_path)
-
-         # Align durations with crossfade
-         if len(music) < len(voice):
-             loops = (len(voice) // len(music)) + 1
-             music = music * loops
-
-         music = music[:len(voice)].fade_out(crossfade)
-         voice = voice.fade_in(crossfade)
-
-         # Apply ducking
          if ducking:
              ducked_music = music - duck_level
-             mixed = ducked_music.overlay(voice)
          else:
-             mixed = music.overlay(voice)
-
-         output_path = os.path.join(tempfile.gettempdir(), "final_mix.wav")
-         mixed.export(output_path, format="wav")
          return output_path
      except Exception as e:
-         return f"Error: {str(e)}"

- # -------------------------------
- # Gradio Interface
- # -------------------------------
- theme = gr.themes.Soft(
-     primary_hue="blue",
-     secondary_hue="teal",
- ).set(
-     body_text_color_dark='#FFFFFF',
-     background_fill_primary_dark='#1F1F1F'
- )

- with gr.Blocks(theme=theme, title="AI Radio Studio Pro") as demo:
      gr.Markdown("""
-     # 🎧 AI Radio Studio Pro
-     *Professional Audio Production in 4 Steps*
-     """)

      with gr.Tabs():
-         # Step 1: Concept Development
-         with gr.Tab("1️⃣ Concept"):
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     concept_input = gr.Textbox(
-                         label="Your Idea",
-                         placeholder="e.g., A 30-second morning show intro with energetic music...",
-                         lines=3
-                     )
-                     with gr.Accordion("Advanced Settings", open=False):
-                         model_selector = gr.Dropdown(
-                             choices=list(MODEL_CONFIG["llama_models"].values()),
-                             label="AI Model",
-                             value=MODEL_CONFIG["llama_models"]["Meta-Llama-3-8B"]
-                         )
-                         duration_slider = gr.Slider(15, 120, 30, step=15, label="Duration (seconds)")
-                         temp_slider = gr.Slider(0.1, 1.5, 0.7, step=0.1, label="Creativity")
-                     generate_btn = gr.Button("Generate Script", variant="primary")
-
-                 with gr.Column(scale=1):
-                     script_output = gr.Textbox(label="Voice Script", interactive=True)
-                     sound_output = gr.Textbox(label="Sound Design", interactive=True)
-                     music_output = gr.Textbox(label="Music Style", interactive=True)
-
-         # Step 2: Voice Production
-         with gr.Tab("2️⃣ Voice"):
              with gr.Row():
-                 with gr.Column():
-                     tts_selector = gr.Dropdown(
-                         choices=list(MODEL_CONFIG["tts_models"].values()),
-                         label="Voice Model",
-                         value="tts_models/en/ljspeech/tacotron2-DDC"
-                     )
-                     speed_slider = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speaking Rate")
-                     voice_btn = gr.Button("Generate Voiceover", variant="primary")
-                 with gr.Column():
-                     voice_preview = gr.Audio(label="Preview", type="filepath")
-
-         # Step 3: Music Production
-         with gr.Tab("3️⃣ Music"):
-             with gr.Row():
-                 with gr.Column():
-                     music_duration = gr.Slider(10, 120, 30, label="Duration (seconds)")
-                     music_temp = gr.Slider(0.1, 2.0, 1.0, label="Creativity")
-                     guidance_scale = gr.Slider(1.0, 5.0, 3.0, label="Focus")
-                     music_btn = gr.Button("Generate Music", variant="primary")
-                 with gr.Column():
-                     music_preview = gr.Audio(label="Preview", type="filepath")
-
-         # Step 4: Final Mix
-         with gr.Tab("4️⃣ Mix"):
-             with gr.Row():
-                 with gr.Column():
-                     ducking_toggle = gr.Checkbox(True, label="Enable Voice Ducking")
-                     duck_level = gr.Slider(0, 30, 12, label="Ducking Strength (dB)")
-                     crossfade_time = gr.Slider(0, 2000, 500, label="Crossfade (ms)")
-                     mix_btn = gr.Button("Create Final Mix", variant="primary")
-                 with gr.Column():
-                     final_mix = gr.Audio(label="Master Output", type="filepath")
-
-     # Examples & Footer
-     with gr.Accordion("💡 Example Prompts", open=False):
-         gr.Examples(
-             examples=[
-                 ["A 45-second tech podcast intro with futuristic synth effects"],
-                 ["A 15-second coffee shop radio ad with morning acoustic vibes"],
-                 ["A 60-second documentary trailer with epic orchestral music"]
-             ],
-             inputs=concept_input
-         )

      gr.Markdown("""
-     <div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 1px solid #444;">
-         <p style="font-size: 0.9em; color: #888;">
-             Created with ❤️ by <a href="https://bilsimaging.com" target="_blank" style="color: #66b3ff;">bilsimaging.com</a>
-         </p>
-         <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
-             <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759"/>
-         </a>
-     </div>
      """)
-
-     # Event Handling
-     generate_btn.click(
-         generate_script,
-         inputs=[concept_input, model_selector, duration_slider, temp_slider],
-         outputs=[script_output, sound_output, music_output]
-     )
-
-     voice_btn.click(
-         generate_voice,
-         inputs=[script_output, tts_selector, speed_slider],
-         outputs=voice_preview
-     )
-
-     music_btn.click(
-         generate_music,
-         inputs=[music_output, music_duration, music_temp, guidance_scale],
-         outputs=music_preview
-     )

-     mix_btn.click(
-         blend_audio,
-         inputs=[voice_preview, music_preview, ducking_toggle, duck_level, crossfade_time],
-         outputs=final_mix
-     )

- if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)

  from dotenv import load_dotenv
  import tempfile
  import spaces
+
+ # Coqui TTS
  from TTS.api import TTS

+ # ---------------------------------------------------------------------
+ # Load Environment Variables
+ # ---------------------------------------------------------------------
  load_dotenv()
  HF_TOKEN = os.getenv("HF_TOKEN")

+ # ---------------------------------------------------------------------
+ # Global Model Caches
+ # ---------------------------------------------------------------------
+ LLAMA_PIPELINES = {}
+ MUSICGEN_MODELS = {}
+ TTS_MODELS = {}
+
+ # ---------------------------------------------------------------------
+ # Helper Functions
+ # ---------------------------------------------------------------------
+ def get_llama_pipeline(model_id: str, token: str):
+     """
+     Returns a cached LLaMA pipeline if available; otherwise, loads it.
+     """
+     if model_id in LLAMA_PIPELINES:
+         return LLAMA_PIPELINES[model_id]
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         use_auth_token=token,
+         torch_dtype=torch.float16,
+         device_map="auto",
+         trust_remote_code=True,
+     )
+     text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+     LLAMA_PIPELINES[model_id] = text_pipeline
+     return text_pipeline
+
+
+ def get_musicgen_model(model_key: str = "facebook/musicgen-large"):
+     """
+     Returns a cached MusicGen model if available; otherwise, loads it.
+     Uses the 'large' variant for higher quality outputs.
+     """
+     if model_key in MUSICGEN_MODELS:
+         return MUSICGEN_MODELS[model_key]
+
+     model = MusicgenForConditionalGeneration.from_pretrained(model_key)
+     processor = AutoProcessor.from_pretrained(model_key)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model.to(device)
+     MUSICGEN_MODELS[model_key] = (model, processor)
+     return model, processor
+
+
+ def get_tts_model(model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
+     """
+     Returns a cached TTS model if available; otherwise, loads it.
+     """
+     if model_name in TTS_MODELS:
+         return TTS_MODELS[model_name]
+
+     tts_model = TTS(model_name)
+     TTS_MODELS[model_name] = tts_model
+     return tts_model
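
The three module-level dicts replace the removed `ModelManager` class with the same memoization idea: the first call loads and caches, every later call is a dictionary hit. A quick usage sketch of the pattern:

```python
# First call loads the model; the second returns the same cached object.
tts_a = get_tts_model("tts_models/en/ljspeech/tacotron2-DDC")
tts_b = get_tts_model("tts_models/en/ljspeech/tacotron2-DDC")
assert tts_a is tts_b  # loaded once per process
```

One caveat: recent `transformers` releases deprecate the `use_auth_token=` argument kept here in favor of `token=`, so `get_llama_pipeline` may log deprecation warnings on newer versions.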
+
+
+ # ---------------------------------------------------------------------
+ # Script Generation Function
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def generate_script(user_prompt: str, model_id: str, token: str, duration: int):
+     """
+     Generates a script, sound design suggestions, and music ideas from a user prompt.
+     Returns a tuple of strings: (voice_script, sound_design, music_suggestions).
+     """
      try:
+         text_pipeline = get_llama_pipeline(model_id, token)
+
+         system_prompt = (
+             "You are an expert radio imaging producer specializing in sound design and music. "
+             f"Based on the user's concept and the selected duration of {duration} seconds, produce the following: "
+             "1. A concise voice-over script. Prefix this section with 'Voice-Over Script:'.\n"
+             "2. Suggestions for sound design. Prefix this section with 'Sound Design Suggestions:'.\n"
+             "3. Music styles or track recommendations. Prefix this section with 'Music Suggestions:'."
          )
+         combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nOutput:"
+
+         with torch.inference_mode():
+             result = text_pipeline(
+                 combined_prompt,
+                 max_new_tokens=300,
+                 do_sample=True,
+                 temperature=0.8
+             )
+
+         generated_text = result[0]["generated_text"]
+         if "Output:" in generated_text:
+             generated_text = generated_text.split("Output:")[-1].strip()
+
+         # Default placeholders
+         voice_script = "No voice-over script found."
+         sound_design = "No sound design suggestions found."
+         music_suggestions = "No music suggestions found."
+
+         # Voice-Over Script
+         if "Voice-Over Script:" in generated_text:
+             parts = generated_text.split("Voice-Over Script:")
+             voice_script_part = parts[1]
+             if "Sound Design Suggestions:" in voice_script_part:
+                 voice_script = voice_script_part.split("Sound Design Suggestions:")[0].strip()
+             else:
+                 voice_script = voice_script_part.strip()
+
+         # Sound Design
+         if "Sound Design Suggestions:" in generated_text:
+             parts = generated_text.split("Sound Design Suggestions:")
+             sound_design_part = parts[1]
+             if "Music Suggestions:" in sound_design_part:
+                 sound_design = sound_design_part.split("Music Suggestions:")[0].strip()
+             else:
+                 sound_design = sound_design_part.strip()
+
+         # Music Suggestions
+         if "Music Suggestions:" in generated_text:
+             parts = generated_text.split("Music Suggestions:")
+             music_suggestions = parts[1].strip()
+
+         return voice_script, sound_design, music_suggestions

      except Exception as e:
+         return f"Error generating script: {e}", "", ""

+
+ # ---------------------------------------------------------------------
+ # Voice-Over Generation Function
+ # ---------------------------------------------------------------------
  @spaces.GPU(duration=100)
+ def generate_voice(script: str, tts_model_name: str = "tts_models/en/ljspeech/tacotron2-DDC"):
+     """
+     Generates a voice-over from the provided script using the Coqui TTS model.
+     Returns the file path to the generated .wav file.
+     """
      try:
          if not script.strip():
+             return "Error: No script provided."
+
+         tts_model = get_tts_model(tts_model_name)
+
+         # Generate and save voice
+         output_path = os.path.join(tempfile.gettempdir(), "voice_over.wav")
+         tts_model.tts_to_file(text=script, file_path=output_path)
          return output_path
+
      except Exception as e:
+         return f"Error generating voice: {e}"

+
+ # ---------------------------------------------------------------------
+ # Music Generation Function
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def generate_music(prompt: str, audio_length: int):
+     """
+     Generates music from the 'facebook/musicgen-large' model based on the prompt.
+     Returns the file path to the generated .wav file.
+     """
      try:
+         if not prompt.strip():
+             return "Error: No music suggestion provided."
+
+         model_key = "facebook/musicgen-large"
+         musicgen_model, musicgen_processor = get_musicgen_model(model_key)
+
          device = "cuda" if torch.cuda.is_available() else "cpu"
+         inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
+
+         with torch.inference_mode():
+             outputs = musicgen_model.generate(**inputs, max_new_tokens=audio_length)
+
+         audio_data = outputs[0, 0].cpu().numpy()
+         normalized_audio = (audio_data / max(abs(audio_data)) * 32767).astype("int16")
+
+         output_path = f"{tempfile.gettempdir()}/musicgen_large_generated_music.wav"
+         write(output_path, 44100, normalized_audio)

          return output_path
+
      except Exception as e:
+         return f"Error generating music: {e}"
+

+ # ---------------------------------------------------------------------
+ # Audio Blending with Duration Sync & Ducking
+ # ---------------------------------------------------------------------
+ @spaces.GPU(duration=100)
+ def blend_audio(voice_path: str, music_path: str, ducking: bool, duck_level: int = 10):
+     """
+     Blends two audio files (voice and music).
+     1. If music < voice, loops the music until it meets/exceeds the voice duration.
+     2. If music > voice, trims music to the voice duration.
+     3. If ducking=True, the music is attenuated by 'duck_level' dB while the voice is playing.
+     Returns the file path to the blended .wav file.
+     """
      try:
+         if not os.path.isfile(voice_path) or not os.path.isfile(music_path):
+             return "Error: Missing audio files for blending."
+
          voice = AudioSegment.from_wav(voice_path)
          music = AudioSegment.from_wav(music_path)
+
+         voice_len = len(voice)  # in milliseconds
+         music_len = len(music)  # in milliseconds
+
+         # 1) If the music is shorter than the voice, loop it:
+         if music_len < voice_len:
+             looped_music = AudioSegment.empty()
+             # Keep appending until we exceed voice length
+             while len(looped_music) < voice_len:
+                 looped_music += music
+             music = looped_music
+
+         # 2) If the music is longer than the voice, truncate it:
+         if len(music) > voice_len:
+             music = music[:voice_len]
+
+         # Now music and voice are the same length
          if ducking:
+             # Step 1: Reduce music dB while voice is playing
              ducked_music = music - duck_level
+             # Step 2: Overlay voice on top of ducked music
+             final_audio = ducked_music.overlay(voice)
          else:
+             # No ducking, just overlay
+             final_audio = music.overlay(voice)
+
+         output_path = os.path.join(tempfile.gettempdir(), "blended_output.wav")
+         final_audio.export(output_path, format="wav")
          return output_path
+
      except Exception as e:
+         return f"Error blending audio: {e}"


+ # ---------------------------------------------------------------------
+ # Gradio Interface
+ # ---------------------------------------------------------------------
+ with gr.Blocks() as demo:
      gr.Markdown("""
+     # 🎧 AI Promo Studio
+     Welcome to **AI Promo Studio**, your all-in-one solution for creating professional, engaging audio promos with minimal effort!
+
+     This next-generation platform uses powerful AI models to handle:
+     - **Script Generation**: Craft concise and impactful copy with LLaMA.
+     - **Voice Synthesis**: Convert text into natural-sounding voice-overs using Coqui TTS.
+     - **Music Production**: Generate custom music tracks with MusicGen Large for sound bed.
+     - **Seamless Blending**: Easily combine voice and music—loop or trim tracks to match your desired promo length, with optional ducking to keep the voice front and center.
+
+     Whether you’re a radio producer, podcaster, or content creator, **AI Promo Studio** streamlines your entire production pipeline—cutting hours of manual editing down to a few clicks.
+     """)
+

      with gr.Tabs():
+         # Step 1: Generate Script
+         with gr.Tab("Step 1: Generate Script"):
              with gr.Row():
+                 user_prompt = gr.Textbox(
+                     label="Promo Idea",
+                     placeholder="E.g., A 30-second promo for a morning show...",
+                     lines=2
+                 )
+                 llama_model_id = gr.Textbox(
+                     label="LLaMA Model ID",
+                     value="meta-llama/Meta-Llama-3-8B-Instruct",
+                     placeholder="Enter a valid Hugging Face model ID"
+                 )
+                 duration = gr.Slider(
+                     label="Desired Promo Duration (seconds)",
+                     minimum=15,
+                     maximum=60,
+                     step=15,
+                     value=30
+                 )
+
+             generate_script_button = gr.Button("Generate Script")
+             script_output = gr.Textbox(label="Generated Voice-Over Script", lines=5, interactive=False)
+             sound_design_output = gr.Textbox(label="Sound Design Suggestions", lines=3, interactive=False)
+             music_suggestion_output = gr.Textbox(label="Music Suggestions", lines=3, interactive=False)
+
+             generate_script_button.click(
+                 fn=lambda user_prompt, model_id, dur: generate_script(user_prompt, model_id, HF_TOKEN, dur),
+                 inputs=[user_prompt, llama_model_id, duration],
+                 outputs=[script_output, sound_design_output, music_suggestion_output],
+             )
+
+         # Step 2: Generate Voice
+         with gr.Tab("Step 2: Generate Voice"):
+             gr.Markdown("Generate the voice-over using a Coqui TTS model.")
+             selected_tts_model = gr.Dropdown(
+                 label="TTS Model",
+                 choices=[
+                     "tts_models/en/ljspeech/tacotron2-DDC",
+                     "tts_models/en/ljspeech/vits",
+                     "tts_models/en/sam/tacotron-DDC",
+                 ],
+                 value="tts_models/en/ljspeech/tacotron2-DDC",
+                 multiselect=False
+             )
+             generate_voice_button = gr.Button("Generate Voice-Over")
+             voice_audio_output = gr.Audio(label="Voice-Over (WAV)", type="filepath")
+
+             generate_voice_button.click(
+                 fn=lambda script, tts_model: generate_voice(script, tts_model),
+                 inputs=[script_output, selected_tts_model],
+                 outputs=voice_audio_output,
+             )
+
+         # Step 3: Generate Music (MusicGen Large)
+         with gr.Tab("Step 3: Generate Music"):
+             gr.Markdown("Generate a music track with the **MusicGen Large** model.")
+             audio_length = gr.Slider(
+                 label="Music Length (tokens)",
+                 minimum=128,
+                 maximum=1024,
+                 step=64,
+                 value=512,
+                 info="Increase tokens for longer audio, but be mindful of inference time."
+             )
+             generate_music_button = gr.Button("Generate Music")
+             music_output = gr.Audio(label="Generated Music (WAV)", type="filepath")
+
+             generate_music_button.click(
+                 fn=lambda music_suggestion, length: generate_music(music_suggestion, length),
+                 inputs=[music_suggestion_output, audio_length],
+                 outputs=[music_output],
+             )
+
+         # Step 4: Blend Audio (Loop/Trim + Ducking)
+         with gr.Tab("Step 4: Blend Audio"):
+             gr.Markdown("**Music** will be looped or trimmed to match **Voice** duration, then optionally ducked.")
+             ducking_checkbox = gr.Checkbox(label="Enable Ducking?", value=True)
+             duck_level_slider = gr.Slider(
+                 label="Ducking Level (dB attenuation)",
+                 minimum=0,
+                 maximum=20,
+                 step=1,
+                 value=10
+             )
+             blend_button = gr.Button("Blend Voice + Music")
+             blended_output = gr.Audio(label="Final Blended Output (WAV)", type="filepath")

+             blend_button.click(
+                 fn=blend_audio,
+                 inputs=[voice_audio_output, music_output, ducking_checkbox, duck_level_slider],
+                 outputs=blended_output
+             )
+
+     # Footer
      gr.Markdown("""
+     <hr>
+     <p style="text-align: center; font-size: 0.9em;">
+         Created with ❤️ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
+     </p>
      """)

+     # Visitor Badge
+     gr.HTML("""
+     <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold">
+         <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2Fradiogold&countColor=%23263759" />
+     </a>
+     """)

+ demo.launch(debug=True)
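
A last wiring note: `generate_script` takes the Hugging Face token as its third parameter, so the Step 1 handler binds the module-level `HF_TOKEN` inside a lambda instead of exposing it as a UI input. An equivalent formulation with a named wrapper, which can be easier to read and debug than the inline lambda (the `generate_script_ui` name is illustrative, not part of the commit):

```python
def generate_script_ui(user_prompt: str, model_id: str, duration: int):
    # Bind the server-side token; the browser never sees it.
    return generate_script(user_prompt, model_id, HF_TOKEN, duration)

generate_script_button.click(
    fn=generate_script_ui,
    inputs=[user_prompt, llama_model_id, duration],
    outputs=[script_output, sound_design_output, music_suggestion_output],
)
```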