Update app.py
app.py CHANGED
@@ -8,6 +8,8 @@ import google.generativeai as genai
 import re
 import logging
 import numpy as np
+from pydub import AudioSegment
+import io
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -133,6 +135,7 @@ def redistribute_codes(code_list, snac_model):
     audio_hat = snac_model.decode(codes)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 
+@spaces.GPU()
 @spaces.GPU()
 @spaces.GPU()
 def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
@@ -140,6 +143,9 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
         return None
 
     try:
+        # Load the intro/outro music
+        music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
+
         progress(0.1, "Processing text...")
         lines = text.split('\n')
         audio_samples = []
@@ -179,12 +185,26 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
         # Concatenate all audio samples
         final_audio = np.concatenate(audio_samples)
 
+        # Convert numpy array to AudioSegment
+        speech_audio = AudioSegment(
+            final_audio.tobytes(),
+            frame_rate=24000,
+            sample_width=final_audio.dtype.itemsize,
+            channels=1
+        )
+
+        # Combine intro, speech, and outro
+        combined_audio = music + speech_audio + music
+
+        # Convert back to numpy array
+        combined_numpy = np.array(combined_audio.get_array_of_samples())
+
         # Add a check for 15-second limitation
         max_samples = 24000 * 15  # 15 seconds at 24kHz sample rate
-        if len(final_audio) > max_samples:
-            final_audio = final_audio[:max_samples]
+        if len(combined_numpy) > max_samples:
+            combined_numpy = combined_numpy[:max_samples]
 
-        return (24000, final_audio)
+        return (24000, combined_numpy)
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
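
For reference, a minimal standalone sketch of the numpy-to-pydub round-trip the new code relies on. The commit passes final_audio.tobytes() straight into AudioSegment; the sketch instead scales the waveform to 16-bit PCM first, since pydub interprets the raw bytes as integer samples of the given sample_width. That conversion, and the helper names numpy_to_segment / segment_to_numpy, are illustrative assumptions rather than part of the commit.

import numpy as np
from pydub import AudioSegment

def numpy_to_segment(samples, rate=24000):
    # pydub reads the raw bytes as integer PCM, so scale float audio
    # in [-1, 1] to 16-bit samples before wrapping it.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    return AudioSegment(
        pcm.tobytes(),
        frame_rate=rate,
        sample_width=pcm.dtype.itemsize,  # 2 bytes per int16 sample
        channels=1,
    )

def segment_to_numpy(segment):
    # get_array_of_samples() returns integer samples; normalise to float32.
    return np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0

# Assumes Maiko-intro-outro.mp3 sits next to app.py, as in the commit.
music = AudioSegment.from_mp3("Maiko-intro-outro.mp3").set_frame_rate(24000).set_channels(1)
speech = numpy_to_segment(np.zeros(24000, dtype=np.float32))  # 1 s of silence as a stand-in
combined = segment_to_numpy(music + speech + music)

The returned (24000, combined_numpy) tuple is the (sample_rate, samples) format a Gradio Audio output accepts, and the cap of 24000 * 15 = 360,000 samples is applied after the music is added, so the intro and outro count toward the 15-second limit.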