emirhanbilgic committed on
Commit
5f01cca
·
verified ·
1 Parent(s): 7ca15a8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -7
app.py CHANGED
@@ -116,22 +116,21 @@ def text_to_speech(text, audio_file=None):
116
  # Normalize the input text
117
  normalized_text = normalize_text(text)
118
 
 
119
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
120
 
 
121
  speaker_embeddings = default_embedding
122
 
123
  # Generate speech
124
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
 
125
 
126
  # Convert the generated speech to numpy array format
127
  speech_np = speech.cpu().numpy()
128
 
129
- # Write the output to a temporary file
130
- output_file = "output.wav"
131
- sf.write(output_file, speech_np, samplerate=16000)
132
-
133
  # Return the numpy array and the sample rate
134
- return speech_np, 16000
135
 
136
  iface = gr.Interface(
137
  fn=text_to_speech,
@@ -145,4 +144,4 @@ iface = gr.Interface(
145
  description="Enter Turkish text, optionally upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
146
  )
147
 
148
- iface.launch(share=True)
 
116
  # Normalize the input text
117
  normalized_text = normalize_text(text)
118
 
119
+ # Prepare the input for the model
120
  inputs = processor(text=normalized_text, return_tensors="pt").to(device)
121
 
122
+ # Use the default speaker embedding
123
  speaker_embeddings = default_embedding
124
 
125
  # Generate speech
126
+ with torch.no_grad():
127
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
128
 
129
  # Convert the generated speech to numpy array format
130
  speech_np = speech.cpu().numpy()
131
 
 
 
 
 
132
  # Return the numpy array and the sample rate
133
+ return (speech_np, 16000)
134
 
135
  iface = gr.Interface(
136
  fn=text_to_speech,
 
144
  description="Enter Turkish text, optionally upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
145
  )
146
 
147
+ iface.launch(share=True)