bluenevus committed on
Commit
841bbb9
·
verified ·
1 Parent(s): d9cc1e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -3
app.py CHANGED
@@ -8,6 +8,8 @@ import google.generativeai as genai
8
  import re
9
  import logging
10
  import numpy as np
 
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
@@ -133,6 +135,7 @@ def redistribute_codes(code_list, snac_model):
133
  audio_hat = snac_model.decode(codes)
134
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
135
 
 
136
  @spaces.GPU()
137
  @spaces.GPU()
138
  def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
@@ -140,6 +143,9 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
140
  return None
141
 
142
  try:
 
 
 
143
  progress(0.1, "Processing text...")
144
  lines = text.split('\n')
145
  audio_samples = []
@@ -179,12 +185,26 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
179
  # Concatenate all audio samples
180
  final_audio = np.concatenate(audio_samples)
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  # Add a check for 15-second limitation
183
  max_samples = 24000 * 15 # 15 seconds at 24kHz sample rate
184
- if len(final_audio) > max_samples:
185
- final_audio = final_audio[:max_samples]
186
 
187
- return (24000, final_audio)
188
  except Exception as e:
189
  print(f"Error generating speech: {e}")
190
  return None
 
8
  import re
9
  import logging
10
  import numpy as np
11
+ from pydub import AudioSegment
12
+ import io
13
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
 
135
  audio_hat = snac_model.decode(codes)
136
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
137
 
138
+ @spaces.GPU()
139
  @spaces.GPU()
140
  @spaces.GPU()
141
  def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
 
143
  return None
144
 
145
  try:
146
+ # Load the intro/outro music
147
+ music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
148
+
149
  progress(0.1, "Processing text...")
150
  lines = text.split('\n')
151
  audio_samples = []
 
185
  # Concatenate all audio samples
186
  final_audio = np.concatenate(audio_samples)
187
 
188
+ # Convert numpy array to AudioSegment
189
+ speech_audio = AudioSegment(
190
+ final_audio.tobytes(),
191
+ frame_rate=24000,
192
+ sample_width=final_audio.dtype.itemsize,
193
+ channels=1
194
+ )
195
+
196
+ # Combine intro, speech, and outro
197
+ combined_audio = music + speech_audio + music
198
+
199
+ # Convert back to numpy array
200
+ combined_numpy = np.array(combined_audio.get_array_of_samples())
201
+
202
  # Add a check for 15-second limitation
203
  max_samples = 24000 * 15 # 15 seconds at 24kHz sample rate
204
+ if len(combined_numpy) > max_samples:
205
+ combined_numpy = combined_numpy[:max_samples]
206
 
207
+ return (24000, combined_numpy)
208
  except Exception as e:
209
  print(f"Error generating speech: {e}")
210
  return None