cymic committed
Commit 804432e · 1 Parent(s): 778d33f

Update app.py

Files changed (1):
  1. app.py +89 -4
app.py CHANGED
@@ -1,7 +1,92 @@
+import os
+
+os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
+
+import librosa
+import numpy as np
+import torch
+from torch import no_grad, LongTensor
+import commons
+import utils
 import gradio as gr
+from models import SynthesizerTrn
+from text import text_to_sequence
+from mel_processing import spectrogram_torch
+
+
+def get_text(text):
+    text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
+    if hps.data.add_blank:
+        text_norm = commons.intersperse(text_norm, 0)
+    text_norm = LongTensor(text_norm)
+    return text_norm
+
+
+def tts_fn(text, speaker_id):
+    if len(text) > 150:
+        return "Error: Text is too long", None
+    stn_tst = get_text(text)
+    with no_grad():
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = LongTensor([stn_tst.size(0)])
+        sid = LongTensor([speaker_id])
+        audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][
+            0, 0].data.cpu().float().numpy()
+    return "Success", (hps.data.sampling_rate, audio)
+
+
+def vc_fn(original_speaker_id, target_speaker_id, input_audio):
+    if input_audio is None:
+        return "You need to upload an audio", None
+    sampling_rate, audio = input_audio
+    duration = audio.shape[0] / sampling_rate
+    if duration > 30:
+        return "Error: Audio is too long", None
+    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+    if len(audio.shape) > 1:
+        audio = librosa.to_mono(audio.transpose(1, 0))
+    if sampling_rate != hps.data.sampling_rate:
+        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+    y = torch.FloatTensor(audio)
+    y = y.unsqueeze(0)
+    spec = spectrogram_torch(y, hps.data.filter_length,
+                             hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                             center=False)
+    spec_lengths = LongTensor([spec.size(-1)])
+    sid_src = LongTensor([original_speaker_id])
+    sid_tgt = LongTensor([target_speaker_id])
+    with no_grad():
+        audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+            0, 0].data.cpu().float().numpy()
+    return "Success", (hps.data.sampling_rate, audio)
+
+
+if __name__ == '__main__':
+    config_path = "saved_model/config.json"
+    model_path = "saved_model/model.pth"
+    hps = utils.get_hparams_from_file(config_path)
+    model = SynthesizerTrn(
+        len(hps.symbols),
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        n_speakers=hps.data.n_speakers,
+        **hps.model)
+    utils.load_checkpoint(model_path, model, None)
+    model.eval()
+
+    app = gr.Blocks()
+
+    with app:
+        with gr.Tabs():
+            with gr.TabItem("TTS"):
+                with gr.Column():
+                    tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
+                    tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+                    tts_submit = gr.Button("Generate", variant="primary")
+                    tts_output1 = gr.Textbox(label="Output Message")
+                    tts_output2 = gr.Audio(label="Output Audio")
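+            with gr.TabItem("Voice Conversion"):
+                with gr.Column():
+                    # NOTE: not part of this commit. The vc_submit.click(...)
+                    # wiring below references vc_* components that are never
+                    # defined in this revision of app.py, so it would raise a
+                    # NameError at startup; this tab is a hypothetical sketch
+                    # inferred from vc_fn's signature, mirroring the TTS tab.
+                    vc_input1 = gr.Dropdown(label="Original Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+                    vc_input2 = gr.Dropdown(label="Target Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+                    vc_input3 = gr.Audio(label="Input Audio (30 seconds limitation)")
+                    vc_submit = gr.Button("Convert", variant="primary")
+                    vc_output1 = gr.Textbox(label="Output Message")
+                    vc_output2 = gr.Audio(label="Output Audio")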
 
-def greet(name):
-    return "Hello " + name + "!!"
+        tts_submit.click(tts_fn, [tts_input1, tts_input2], [tts_output1, tts_output2])
+        vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+    app.launch()
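Two details of the added code are worth spelling out.

First, `get_text` calls `commons.intersperse` when `hps.data.add_blank` is set: the phoneme-ID sequence is padded with a blank token (ID 0) between and around every symbol, which models trained with `add_blank: true` expect at inference time. A minimal sketch of that behavior, consistent with the reference VITS helper:

```python
# Interleave `item` between and around the elements of `lst`:
# intersperse([5, 7, 9], 0) -> [0, 5, 0, 7, 0, 9, 0]
def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst  # odd positions receive the original elements
    return result

assert intersperse([5, 7, 9], 0) == [0, 5, 0, 7, 0, 9, 0]
```

Second, `vc_fn`'s line `(audio / np.iinfo(audio.dtype).max).astype(np.float32)` assumes the uploaded audio arrives as integer PCM (gradio's numpy audio format is typically int16) and rescales it to roughly [-1, 1] before the mono and resampling steps:

```python
import numpy as np

# int16 PCM -> float32 in approximately [-1, 1]
pcm = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
print((pcm / np.iinfo(pcm.dtype).max).astype(np.float32))
# [-1.0000305  0.         0.5000153  1.       ]
```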