Create app.py
app.py ADDED
@@ -0,0 +1,159 @@
+from inference.infer_tool import Svc
+from vextract.vocal_extract import VEX
+import gradio as gr
+import os
+
+# os.environ['CUDA_VISIBLE_DEVICES'] = '1,2'
+
+
+class VitsGradio:
+    def __init__(self):
+        self.so = Svc()
+        self.v = VEX()
+        self.lspk = []
+        self.modelPaths = []
+        # Every subdirectory under "checkpoints" is offered as a selectable model.
+        for root, dirs, files in os.walk("checkpoints"):
+            for dir in dirs:
+                self.modelPaths.append(dir)
+        with gr.Blocks(title="Sovits Singing Synthesis Tool") as self.Vits:
+            gr.Markdown(
+                """
+                # Singing Synthesis Tool
+                - Please select the voice model, device, and operating mode in sequence, then click "Load Model"
+                - The input audio needs to be clean vocals
+                """
+            )
+            with gr.Tab("Vocal Extraction"):
+                with gr.Row():
+                    with gr.Column():
+                        sample_audio = gr.Audio(label="Input Audio")
+                        extractAudioBtn = gr.Button("Extract Vocals")
+                with gr.Row():
+                    with gr.Column():
+                        self.sample_vocal_output = gr.Audio(label="Output Audio")
+                        self.sample_accompaniment_output = gr.Audio(label="Accompaniment")
+                extractAudioBtn.click(self.v.separate, inputs=[sample_audio],
+                                      outputs=[self.sample_vocal_output, self.sample_accompaniment_output],
+                                      show_progress=True, api_name="extract")
+            with gr.Tab("Singing Synthesis"):
+                # Single-file mode; hidden until a model is loaded via "Load Model".
+                with gr.Row(visible=False) as self.VoiceConversion:
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                self.srcaudio = gr.Audio(label="Input Audio")
+                                self.btnVC = gr.Button("Speaker Conversion")
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                self.dsid0 = gr.Dropdown(label="Target Character", choices=self.lspk)
+                                self.tran = gr.Slider(label="Pitch Shift", maximum=60, minimum=-60,
+                                                      step=1, value=0)
+                                self.th = gr.Slider(label="Slice Threshold", maximum=32767, minimum=-32768,
+                                                    step=0.1, value=-40)
+                                self.ns = gr.Slider(label="Noise Level", maximum=1.0, minimum=0.0,
+                                                    step=0.1, value=0.4)
+                        with gr.Row():
+                            self.VCOutputs = gr.Audio()
+                    self.btnVC.click(self.so.inference,
+                                     inputs=[self.srcaudio, self.dsid0, self.tran, self.th, self.ns],
+                                     outputs=[self.VCOutputs], show_progress=True, api_name="run")
+
+                # Batch mode: same parameters, multiple .wav uploads, zipped output.
+                # Note that self.srcaudio, self.btnVC, and the sliders are re-bound here;
+                # the click handler above already holds the single-mode components.
+                with gr.Row(visible=False) as self.VoiceBatchConversion:
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                self.srcaudio = gr.Files(label="Upload Multiple Audio Files",
+                                                         file_types=['.wav'], interactive=True)
+                                self.btnVC = gr.Button("Speaker Conversion")
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                self.dsid1 = gr.Dropdown(label="Target Character", choices=self.lspk)
+                                self.tran = gr.Slider(label="Pitch Shift", maximum=60, minimum=-60,
+                                                      step=1, value=0)
+                                self.th = gr.Slider(label="Slice Threshold", maximum=32767, minimum=-32768,
+                                                    step=0.1, value=-40)
+                                self.ns = gr.Slider(label="Noise Level", maximum=1.0, minimum=0.0,
+                                                    step=0.1, value=0.4)
+                        with gr.Row():
+                            self.VCOutputs = gr.File(label="Output Zip File", interactive=False)
+                    self.btnVC.click(self.batch_inference,
+                                     inputs=[self.srcaudio, self.dsid1, self.tran, self.th, self.ns],
+                                     outputs=[self.VCOutputs], show_progress=True, api_name="batch")
+
+                with gr.Row():
+                    with gr.Column():
+                        # Guard against an empty checkpoints directory.
+                        modelstrs = gr.Dropdown(label="Model", choices=self.modelPaths,
+                                                value=self.modelPaths[0] if self.modelPaths else None,
+                                                type="value")
+                        devicestrs = gr.Dropdown(label="Device", choices=["cpu", "cuda"], value="cuda",
+                                                 type="value")
+                        isbatchmod = gr.Radio(label="Operating Mode", choices=["single", "batch"],
+                                              value="single",
+                                              info="single: Single file processing. batch: Batch processing supports uploading multiple files")
+                        btnMod = gr.Button("Load Model")
+                        btnMod.click(self.loadModel, inputs=[modelstrs, devicestrs, isbatchmod],
+                                     outputs=[self.dsid0, self.dsid1, self.VoiceConversion, self.VoiceBatchConversion],
+                                     show_progress=True, api_name="switch")
+
+    def batch_inference(self, files, chara, tran, slice_db, ns, progress=gr.Progress()):
+        """Convert every uploaded wav with the loaded model and return a zip of the results."""
+        from zipfile import ZipFile
+        from scipy.io import wavfile
+        import uuid
+
+        temp_directory = "temp"
+        if not os.path.exists(temp_directory):
+            os.mkdir(temp_directory)
+
+        progress(0.00, desc="Initializing Directory")
+        # A unique working directory per batch avoids collisions between concurrent requests.
+        tmp_workdir_name = f"{temp_directory}/batch_{uuid.uuid4()}"
+        if not os.path.exists(tmp_workdir_name):
+            os.mkdir(tmp_workdir_name)
+        progress(0.10, desc="Initializing Directory")
+
+        output_files = []
+        for idx, file in enumerate(files):
+            filename = os.path.basename(file.name)
+            progress(0.10 + (0.70 / float(len(files))) * (idx + 1.00),
+                     desc=f"Processing Audio {idx + 1}/{len(files)}: {filename}")
+            print(f"{idx}, {file}, {filename}")
+            sampling_rate, audio = wavfile.read(file.name)
+            output_sampling_rate, output_audio = self.so.inference(
+                (sampling_rate, audio), chara=chara, tran=tran, slice_db=slice_db, ns=ns)
+            new_filepath = f"{tmp_workdir_name}/{filename}"
+            wavfile.write(filename=new_filepath, rate=output_sampling_rate, data=output_audio)
+            output_files.append(new_filepath)
+        progress(0.70, desc="Audio Processing Complete")
+
+        zipfilename = f"{tmp_workdir_name}/output.zip"
+        with ZipFile(zipfilename, "w") as zip_obj:
+            for filepath in output_files:
+                zip_obj.write(filepath, os.path.basename(filepath))
+            progress(0.80, desc="Compression Complete")
+        # todo: remove data
+        progress(1.00, desc="Cleaning Up")
+        return zipfilename
+
+    def loadModel(self, path, device, process_mode):
+        """Load a checkpoint, refresh the speaker lists, and show the row matching the chosen mode."""
+        self.lspk = []
+        print(f"path: {path}, device: {device}")
+        self.so.set_device(device)
+        print("device set.")
+        self.so.load_checkpoint(path)
+        print("checkpoint loaded")
+        for spk, sid in self.so.hps_ms.spk.items():
+            self.lspk.append(spk)
+        print(f"LSPK: {self.lspk}")
+        if process_mode == "single":
+            VChange = gr.update(visible=True)
+            VBChange = gr.update(visible=False)
+        else:
+            VChange = gr.update(visible=False)
+            VBChange = gr.update(visible=True)
+        SD0Change = gr.update(choices=self.lspk, value=self.lspk[0])
+        SD1Change = gr.update(choices=self.lspk, value=self.lspk[0])
+        print("All set. Updating display")
+        return [SD0Change, SD1Change, VChange, VBChange]
+
+
+if __name__ == "__main__":
+    grVits = VitsGradio()
+    grVits.Vits.queue(concurrency_count=20, status_update_rate=5.0).launch(
+        server_port=7870, share=True, show_api=False)