bluenevus committed on
Commit
f4fb1c0
·
verified ·
1 Parent(s): 897a611

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -8
app.py CHANGED
@@ -15,6 +15,7 @@ from pydub import AudioSegment
15
  from docx import Document
16
  import PyPDF2
17
  from tqdm import tqdm
 
18
 
19
  # Initialize logging
20
  logging.basicConfig(level=logging.INFO)
@@ -191,16 +192,22 @@ def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p,
191
 
192
  paragraph_audio = paragraph_audio.cpu().numpy().flatten()
193
 
194
- silences = detect_silence(paragraph_audio)
195
- if silences:
196
- paragraph_audio = paragraph_audio[:silences[-1][1]]
197
 
198
  audio_samples.append(paragraph_audio)
199
 
200
  final_audio = np.concatenate(audio_samples)
201
- final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
202
 
203
- return final_audio
 
 
 
 
 
 
 
 
204
  except Exception as e:
205
  logger.error(f"Error generating speech: {str(e)}")
206
  return None
@@ -320,11 +327,11 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
320
  if not script_output.strip():
321
  return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update, "", ""
322
 
323
- final_audio = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
324
 
325
- if final_audio is not None:
326
  # Convert to base64 for audio playback
327
- audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
328
  src = f"data:audio/wav;base64,{audio_base64}"
329
 
330
  # Create a download link for the audio
 
15
  from docx import Document
16
  import PyPDF2
17
  from tqdm import tqdm
18
+ import soundfile as sf
19
 
20
  # Initialize logging
21
  logging.basicConfig(level=logging.INFO)
 
192
 
193
  paragraph_audio = paragraph_audio.cpu().numpy().flatten()
194
 
195
+ # Normalize audio to [-1, 1] range
196
+ paragraph_audio = paragraph_audio / np.max(np.abs(paragraph_audio))
 
197
 
198
  audio_samples.append(paragraph_audio)
199
 
200
  final_audio = np.concatenate(audio_samples)
 
201
 
202
+ # Convert to 16-bit PCM
203
+ final_audio = (final_audio * 32767).astype(np.int16)
204
+
205
+ # Save as WAV file in memory
206
+ buffer = io.BytesIO()
207
+ sf.write(buffer, final_audio, 24000, format='WAV', subtype='PCM_16')
208
+ buffer.seek(0)
209
+
210
+ return buffer
211
  except Exception as e:
212
  logger.error(f"Error generating speech: {str(e)}")
213
  return None
 
327
  if not script_output.strip():
328
  return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update, "", ""
329
 
330
+ audio_buffer = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
331
 
332
+ if audio_buffer is not None:
333
  # Convert to base64 for audio playback
334
+ audio_base64 = base64.b64encode(audio_buffer.getvalue()).decode('utf-8')
335
  src = f"data:audio/wav;base64,{audio_base64}"
336
 
337
  # Create a download link for the audio