nevreal committed on
Commit
080be53
·
verified ·
1 Parent(s): 1ecc8c1

Delete api_240604.py

Browse files
Files changed (1) hide show
  1. api_240604.py +0 -565
api_240604.py DELETED
@@ -1,565 +0,0 @@
1
- #api for 240604 release version by Xiaokai
2
- import os
3
- import sys
4
- import json
5
- import re
6
- import time
7
- import librosa
8
- import torch
9
- import numpy as np
10
- import torch.nn.functional as F
11
- import torchaudio.transforms as tat
12
- import sounddevice as sd
13
- from dotenv import load_dotenv
14
- from fastapi import FastAPI, HTTPException
15
- from pydantic import BaseModel
16
- import threading
17
- import uvicorn
18
- import logging
19
- from multiprocessing import Queue, Process, cpu_count, freeze_support
20
-
21
# Configure root logging at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the route decorators below register on it.
app = FastAPI()
27
-
28
class GUIConfig:
    """Mutable holder for the runtime settings of the realtime converter.

    Every attribute starts at the default below; ``AudioAPI.set_values``
    overwrites most of them from a request payload and ``AudioAPI.start_vc``
    fills in ``samplerate``.
    """

    def __init__(self) -> None:
        # Model / index file locations (empty until configured).
        self.pth_path: str = ""
        self.index_path: str = ""
        # Conversion parameters.
        self.pitch: int = 0
        self.formant: float = 0.0
        # "sr_model" makes start_vc use the model's sample rate; any other
        # value makes it query the input device instead.
        self.sr_type: str = "sr_model"
        # Effective stream sample rate. Declared here so the attribute always
        # exists; start_vc assigns the real value before it is used.
        self.samplerate: int = 0
        self.block_time: float = 0.25  # s
        # Spelling ("threhold") is kept: it is part of the config schema.
        self.threhold: int = -60
        self.crossfade_time: float = 0.05
        self.extra_time: float = 2.5
        # Input / output noise-reduction switches.
        self.I_noise_reduce: bool = False
        self.O_noise_reduce: bool = False
        self.use_pv: bool = False
        self.rms_mix_rate: float = 0.0
        self.index_rate: float = 0.0
        self.n_cpu: int = 4
        self.f0method: str = "fcpe"
        # Display names of the selected audio devices.
        self.sg_input_device: str = ""
        self.sg_output_device: str = ""
48
-
49
# Request body accepted by POST /config.
# Field names (including the historical "threhold" spelling and
# "crossfade_length") are the wire format and are also the keys written to
# configs/config.json, so they must not be renamed.
class ConfigData(BaseModel):
    # Required: model files and audio device display names.
    pth_path: str
    index_path: str
    sg_input_device: str
    sg_output_device: str

    # Optional tuning parameters with their defaults.
    threhold: int = -60
    pitch: int = 0
    formant: float = 0.0
    index_rate: float = 0.3
    rms_mix_rate: float = 0.0
    block_time: float = 0.25
    crossfade_length: float = 0.05
    extra_time: float = 2.5
    n_cpu: int = 4

    # Feature switches.
    I_noise_reduce: bool = False
    O_noise_reduce: bool = False
    use_pv: bool = False
    f0method: str = "fcpe"
67
-
68
class Harvest(Process):
    """Worker process that runs ``pyworld.harvest`` on chunks taken from a queue.

    Args:
        inp_q: queue yielding ``(idx, x, res_f0, n_cpu, ts)`` work items.
        opt_q: queue on which ``ts`` is posted once ``res_f0`` holds at least
            ``n_cpu`` results.
    """

    def __init__(self, inp_q, opt_q):
        super().__init__()
        self.inp_q = inp_q
        self.opt_q = opt_q

    def run(self):
        """Serve work items forever; the process is expected to be a daemon."""
        # Imported inside run(), presumably so the import happens in the
        # child process itself -- confirm before hoisting to module level.
        import numpy as np
        import pyworld

        while True:
            idx, x, res_f0, n_cpu, ts = self.inp_q.get()
            # harvest() also returns the time axis, which is not needed here.
            f0, _ = pyworld.harvest(
                x.astype(np.double),
                fs=16000,
                f0_ceil=1100,
                f0_floor=50,
                frame_period=10,
            )
            # NOTE(review): res_f0 must be a cross-process mapping for this
            # write to reach the parent -- verify against the caller.
            res_f0[idx] = f0
            if len(res_f0) >= n_cpu:
                self.opt_q.put(ts)
89
-
90
class AudioAPI:
    """Realtime voice-conversion engine driven by the HTTP endpoints.

    Holds the user settings (``gui_config``), the project ``Config`` object
    (``config``, injected by the ``__main__`` block), the RVC model wrapper
    and all streaming buffers used by the sounddevice callback.
    """

    def __init__(self) -> None:
        self.gui_config = GUIConfig()
        self.config = None  # project Config object, injected before use
        self.flag_vc = False  # True while the audio stream should keep running
        self.function = "vc"
        self.delay_time = 0
        self.rvc = None  # RVC model wrapper, created by start_vc
        self.inp_q = None
        self.opt_q = None
        self.n_cpu = min(cpu_count(), 8)

    def initialize_queues(self):
        """Create the work/result queues and start ``n_cpu`` daemon Harvest workers."""
        self.inp_q = Queue()
        self.opt_q = Queue()
        for _ in range(self.n_cpu):
            worker = Harvest(self.inp_q, self.opt_q)
            worker.daemon = True
            worker.start()

    @staticmethod
    def _default_device_name(names, indices, default_index):
        """Map a PortAudio device index to its display name.

        ``names`` and ``indices`` are the parallel lists returned by
        ``get_devices``. Falls back to the first name, or ``""`` when the
        list is empty, if ``default_index`` is not among ``indices``.
        """
        if default_index in indices:
            return names[indices.index(default_index)]
        return names[0] if names else ""

    def load(self):
        """Read ``configs/config.json``, falling back to (and writing) defaults.

        Returns:
            dict: the loaded settings, with unavailable device names replaced
            by the system default devices.
        """
        input_devices, output_devices, input_indices, output_indices = self.get_devices()
        # sd.default.device holds *global* device indices, while the name
        # lists are filtered per direction, so translate through the index
        # lists instead of indexing the name lists directly.
        default_input = self._default_device_name(
            input_devices, input_indices, sd.default.device[0]
        )
        default_output = self._default_device_name(
            output_devices, output_indices, sd.default.device[1]
        )
        try:
            with open("configs/config.json", "r", encoding="utf-8") as j:
                data = json.load(j)
            if data["sg_input_device"] not in input_devices:
                data["sg_input_device"] = default_input
            if data["sg_output_device"] not in output_devices:
                data["sg_output_device"] = default_output
        except Exception as e:
            # Best effort: any unreadable/invalid config is replaced by defaults.
            logger.error(f"Failed to load configuration: {e}")
            data = {
                "pth_path": "",
                "index_path": "",
                "sg_input_device": default_input,
                "sg_output_device": default_output,
                "threhold": -60,
                "pitch": 0,
                "formant": 0.0,
                "index_rate": 0,
                "rms_mix_rate": 0,
                "block_time": 0.25,
                "crossfade_length": 0.05,
                "extra_time": 2.5,
                "n_cpu": 4,
                "f0method": "fcpe",
                "use_jit": False,
                "use_pv": False,
            }
            # The directory may not exist on a fresh checkout.
            os.makedirs("configs", exist_ok=True)
            with open("configs/config.json", "w", encoding="utf-8") as j:
                json.dump(data, j, ensure_ascii=False)
        return data

    def set_values(self, values):
        """Validate a ``ConfigData`` payload and copy it into ``gui_config``.

        Raises:
            HTTPException: 400 when a model/index path is blank or a device
                name is not available.

        Returns:
            bool: always ``True`` on success.
        """
        logger.info(f"Setting values: {values}")
        if not values.pth_path.strip():
            raise HTTPException(status_code=400, detail="Please select a .pth file")
        if not values.index_path.strip():
            raise HTTPException(status_code=400, detail="Please select an index file")
        self.set_devices(values.sg_input_device, values.sg_output_device)
        self.config.use_jit = False
        gui = self.gui_config
        gui.pth_path = values.pth_path
        gui.index_path = values.index_path
        gui.threhold = values.threhold
        gui.pitch = values.pitch
        gui.formant = values.formant
        gui.block_time = values.block_time
        # The payload calls it "crossfade_length"; internally it is a time.
        gui.crossfade_time = values.crossfade_length
        gui.extra_time = values.extra_time
        gui.I_noise_reduce = values.I_noise_reduce
        gui.O_noise_reduce = values.O_noise_reduce
        gui.rms_mix_rate = values.rms_mix_rate
        gui.index_rate = values.index_rate
        gui.n_cpu = values.n_cpu
        gui.use_pv = values.use_pv
        gui.f0method = values.f0method
        return True

    def get_device_samplerate(self):
        """Return the default sample rate of the current default input device."""
        return int(
            sd.query_devices(device=sd.default.device[0])["default_samplerate"]
        )

    def _seconds_to_frames(self, seconds):
        """Convert a duration to a frame count rounded to a multiple of ``zc``."""
        return (
            int(np.round(seconds * self.gui_config.samplerate / self.zc)) * self.zc
        )

    def start_vc(self):
        """Load the model, allocate the buffers and start the audio thread.

        If any setup step raises, ``flag_vc`` is rolled back so a later
        start attempt is not rejected as "already running".
        """
        torch.cuda.empty_cache()
        self.flag_vc = True
        try:
            self._prepare_vc()
        except Exception:
            self.flag_vc = False
            raise
        thread_vc = threading.Thread(target=self.soundinput)
        thread_vc.start()

    def _prepare_vc(self):
        """Build the RVC model, frame sizes, buffers, windows and resamplers."""
        gui = self.gui_config
        device = self.config.device
        self.rvc = rvc_for_realtime.RVC(
            gui.pitch,
            gui.pth_path,
            gui.index_path,
            gui.index_rate,
            gui.n_cpu,
            self.inp_q,
            self.opt_q,
            self.config,
            self.rvc if self.rvc else None,
        )
        gui.samplerate = (
            self.rvc.tgt_sr
            if gui.sr_type == "sr_model"
            else self.get_device_samplerate()
        )
        # zc = samples per 1/100 s at the stream rate; all frame counts below
        # are multiples of it. 160 is the same unit at the 16 kHz rate used
        # by the resampler.
        self.zc = gui.samplerate // 100
        self.block_frame = self._seconds_to_frames(gui.block_time)
        self.block_frame_16k = 160 * self.block_frame // self.zc
        self.crossfade_frame = self._seconds_to_frames(gui.crossfade_time)
        self.sola_buffer_frame = min(self.crossfade_frame, 4 * self.zc)
        self.sola_search_frame = self.zc
        self.extra_frame = self._seconds_to_frames(gui.extra_time)
        self.input_wav = torch.zeros(
            self.extra_frame
            + self.crossfade_frame
            + self.sola_search_frame
            + self.block_frame,
            device=device,
            dtype=torch.float32,
        )
        self.input_wav_denoise = self.input_wav.clone()
        self.input_wav_res = torch.zeros(
            160 * self.input_wav.shape[0] // self.zc,
            device=device,
            dtype=torch.float32,
        )
        self.rms_buffer = np.zeros(4 * self.zc, dtype="float32")
        self.sola_buffer = torch.zeros(
            self.sola_buffer_frame, device=device, dtype=torch.float32
        )
        self.nr_buffer = self.sola_buffer.clone()
        self.output_buffer = self.input_wav.clone()
        self.skip_head = self.extra_frame // self.zc
        self.return_length = (
            self.block_frame + self.sola_buffer_frame + self.sola_search_frame
        ) // self.zc
        # sin^2 fade-in and its complement, used for every crossfade.
        self.fade_in_window = (
            torch.sin(
                0.5
                * np.pi
                * torch.linspace(
                    0.0,
                    1.0,
                    steps=self.sola_buffer_frame,
                    device=device,
                    dtype=torch.float32,
                )
            )
            ** 2
        )
        self.fade_out_window = 1 - self.fade_in_window
        self.resampler = tat.Resample(
            orig_freq=gui.samplerate,
            new_freq=16000,
            dtype=torch.float32,
        ).to(device)
        if self.rvc.tgt_sr != gui.samplerate:
            self.resampler2 = tat.Resample(
                orig_freq=self.rvc.tgt_sr,
                new_freq=gui.samplerate,
                dtype=torch.float32,
            ).to(device)
        else:
            self.resampler2 = None
        self.tg = TorchGate(
            sr=gui.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
        ).to(device)

    def soundinput(self):
        """Thread target: keep the duplex stream open while ``flag_vc`` is set."""
        global stream_latency
        channels = 1 if sys.platform == "darwin" else 2
        try:
            with sd.Stream(
                channels=channels,
                callback=self.audio_callback,
                blocksize=self.block_frame,
                samplerate=self.gui_config.samplerate,
                dtype="float32",
            ) as stream:
                stream_latency = stream.latency[-1]
                while self.flag_vc:
                    time.sleep(self.gui_config.block_time)
                    logger.info("Audio block passed.")
        except Exception:
            # The stream failed to open or died: clear the flag so the
            # service can be started again.
            self.flag_vc = False
            raise
        logger.info("Ending VC")

    @staticmethod
    def _sola_offset(cor_nom, cor_den):
        """Return the index maximizing ``cor_nom / cor_den`` along the last axis.

        Both arguments are the ``(1, 1, n)`` outputs of ``F.conv1d``. The
        macOS path keeps using ``torch.max`` but now passes ``dim=0`` so it
        actually yields ``(values, indices)``.
        """
        ratio = cor_nom[0, 0] / cor_den[0, 0]
        if sys.platform == "darwin":
            _, offset = torch.max(ratio, dim=0)
            return offset.item()
        return torch.argmax(ratio)

    def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
        """sounddevice stream callback: convert one input block into ``outdata``."""
        start_time = time.perf_counter()
        indata = librosa.to_mono(indata.T)
        # Gate: zero every zc-sized frame whose RMS level is below the threshold.
        if self.gui_config.threhold > -60:
            indata = np.append(self.rms_buffer, indata)
            rms = librosa.feature.rms(y=indata, frame_length=4 * self.zc, hop_length=self.zc)[:, 2:]
            self.rms_buffer[:] = indata[-4 * self.zc :]
            indata = indata[2 * self.zc - self.zc // 2 :]
            db_threhold = (
                librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
            )
            for i in range(db_threhold.shape[0]):
                if db_threhold[i]:
                    indata[i * self.zc : (i + 1) * self.zc] = 0
            indata = indata[self.zc // 2 :]
        # Shift the rolling buffers left by one block and append the new block.
        self.input_wav[: -self.block_frame] = self.input_wav[self.block_frame :].clone()
        self.input_wav[-indata.shape[0] :] = torch.from_numpy(indata).to(self.config.device)
        self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[self.block_frame_16k :].clone()
        # input noise reduction and resampling
        if self.gui_config.I_noise_reduce:
            self.input_wav_denoise[: -self.block_frame] = self.input_wav_denoise[self.block_frame :].clone()
            input_wav = self.input_wav[-self.sola_buffer_frame - self.block_frame :]
            input_wav = self.tg(input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)).squeeze(0)
            input_wav[: self.sola_buffer_frame] *= self.fade_in_window
            input_wav[: self.sola_buffer_frame] += self.nr_buffer * self.fade_out_window
            self.input_wav_denoise[-self.block_frame :] = input_wav[: self.block_frame]
            self.nr_buffer[:] = input_wav[self.block_frame :]
            self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
                self.input_wav_denoise[-self.block_frame - 2 * self.zc :]
            )[160:]
        else:
            self.input_wav_res[-160 * (indata.shape[0] // self.zc + 1) :] = (
                self.resampler(self.input_wav[-indata.shape[0] - 2 * self.zc :])[160:]
            )
        # infer
        if self.function == "vc":
            infer_wav = self.rvc.infer(
                self.input_wav_res,
                self.block_frame_16k,
                self.skip_head,
                self.return_length,
                self.gui_config.f0method,
            )
            if self.resampler2 is not None:
                infer_wav = self.resampler2(infer_wav)
        elif self.gui_config.I_noise_reduce:
            infer_wav = self.input_wav_denoise[self.extra_frame :].clone()
        else:
            infer_wav = self.input_wav[self.extra_frame :].clone()
        # output noise reduction
        if self.gui_config.O_noise_reduce and self.function == "vc":
            self.output_buffer[: -self.block_frame] = self.output_buffer[self.block_frame :].clone()
            self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :]
            infer_wav = self.tg(infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)).squeeze(0)
        # volume envelope mixing
        if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
            if self.gui_config.I_noise_reduce:
                input_wav = self.input_wav_denoise[self.extra_frame :]
            else:
                input_wav = self.input_wav[self.extra_frame :]
            rms1 = librosa.feature.rms(
                y=input_wav[: infer_wav.shape[0]].cpu().numpy(),
                frame_length=4 * self.zc,
                hop_length=self.zc,
            )
            rms1 = torch.from_numpy(rms1).to(self.config.device)
            rms1 = F.interpolate(
                rms1.unsqueeze(0),
                size=infer_wav.shape[0] + 1,
                mode="linear",
                align_corners=True,
            )[0, 0, :-1]
            rms2 = librosa.feature.rms(
                y=infer_wav[:].cpu().numpy(),
                frame_length=4 * self.zc,
                hop_length=self.zc,
            )
            rms2 = torch.from_numpy(rms2).to(self.config.device)
            rms2 = F.interpolate(
                rms2.unsqueeze(0),
                size=infer_wav.shape[0] + 1,
                mode="linear",
                align_corners=True,
            )[0, 0, :-1]
            # Floor the denominator so the ratio below cannot blow up.
            rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
            infer_wav *= torch.pow(
                rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate)
            )
        # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
        conv_input = infer_wav[None, None, : self.sola_buffer_frame + self.sola_search_frame]
        cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
        cor_den = torch.sqrt(
            F.conv1d(
                conv_input**2,
                torch.ones(1, 1, self.sola_buffer_frame, device=self.config.device),
            )
            + 1e-8
        )
        sola_offset = self._sola_offset(cor_nom, cor_den)
        logger.info(f"sola_offset = {sola_offset}")
        infer_wav = infer_wav[sola_offset:]
        if "privateuseone" in str(self.config.device) or not self.gui_config.use_pv:
            infer_wav[: self.sola_buffer_frame] *= self.fade_in_window
            infer_wav[: self.sola_buffer_frame] += self.sola_buffer * self.fade_out_window
        else:
            # NOTE(review): phase_vocoder is neither defined nor imported in
            # the visible part of this file; enabling use_pv will raise
            # NameError unless it is provided elsewhere -- confirm.
            infer_wav[: self.sola_buffer_frame] = phase_vocoder(
                self.sola_buffer,
                infer_wav[: self.sola_buffer_frame],
                self.fade_out_window,
                self.fade_in_window,
            )
        # Keep the tail after this block for the next crossfade.
        self.sola_buffer[:] = infer_wav[
            self.block_frame : self.block_frame + self.sola_buffer_frame
        ]
        if sys.platform == "darwin":
            outdata[:] = infer_wav[: self.block_frame].cpu().numpy()[:, np.newaxis]
        else:
            outdata[:] = infer_wav[: self.block_frame].repeat(2, 1).t().cpu().numpy()
        total_time = time.perf_counter() - start_time
        logger.info(f"Infer time: {total_time:.2f}")

    def get_devices(self, update: bool = True):
        """List the audio devices known to sounddevice.

        Args:
            update: when true, re-initialize sounddevice first so the device
                list is refreshed.

        Returns:
            tuple: ``(input_names, output_names, input_indices,
            output_indices)``; names are formatted ``"<name> (<host api>)"``
            and each index list is parallel to its name list.
        """
        if update:
            sd._terminate()
            sd._initialize()
        devices = sd.query_devices()
        for hostapi in sd.query_hostapis():
            for device_idx in hostapi["devices"]:
                devices[device_idx]["hostapi_name"] = hostapi["name"]
        inputs = [d for d in devices if d["max_input_channels"] > 0]
        outputs = [d for d in devices if d["max_output_channels"] > 0]
        return (
            [f"{d['name']} ({d['hostapi_name']})" for d in inputs],
            [f"{d['name']} ({d['hostapi_name']})" for d in outputs],
            [d["index"] if "index" in d else d["name"] for d in inputs],
            [d["index"] if "index" in d else d["name"] for d in outputs],
        )

    def set_devices(self, input_device, output_device):
        """Select sounddevice's default input/output devices by display name.

        Raises:
            HTTPException: 400 when either name is not currently available.
        """
        (
            input_devices,
            output_devices,
            input_device_indices,
            output_device_indices,
        ) = self.get_devices()
        logger.debug(f"Available input devices: {input_devices}")
        logger.debug(f"Available output devices: {output_devices}")
        logger.debug(f"Selected input device: {input_device}")
        logger.debug(f"Selected output device: {output_device}")

        if input_device not in input_devices:
            logger.error(f"Input device '{input_device}' is not in the list of available devices")
            raise HTTPException(status_code=400, detail=f"Input device '{input_device}' is not available")

        if output_device not in output_devices:
            logger.error(f"Output device '{output_device}' is not in the list of available devices")
            raise HTTPException(status_code=400, detail=f"Output device '{output_device}' is not available")

        sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
        sd.default.device[1] = output_device_indices[output_devices.index(output_device)]
        logger.info(f"Input device set to {sd.default.device[0]}: {input_device}")
        logger.info(f"Output device set to {sd.default.device[1]}: {output_device}")
480
-
481
# Single shared engine instance used by every route handler below.
audio_api = AudioAPI()
482
-
483
@app.get("/inputDevices", response_model=list)
def get_input_devices():
    """Return the display names of all input-capable audio devices.

    Any failure while querying devices is logged and reported as HTTP 500.
    """
    try:
        # get_devices() returns (inputs, outputs, input_idx, output_idx).
        return audio_api.get_devices()[0]
    except Exception as e:
        logger.error(f"Failed to get input devices: {e}")
        raise HTTPException(status_code=500, detail="Failed to get input devices")
491
-
492
@app.get("/outputDevices", response_model=list)
def get_output_devices():
    """Return the display names of all output-capable audio devices.

    Any failure while querying devices is logged and reported as HTTP 500.
    """
    try:
        # get_devices() returns (inputs, outputs, input_idx, output_idx).
        return audio_api.get_devices()[1]
    except Exception as e:
        logger.error(f"Failed to get output devices: {e}")
        raise HTTPException(status_code=500, detail="Failed to get output devices")
500
-
501
@app.post("/config")
def configure_audio(config_data: ConfigData):
    """Apply a configuration payload and persist it to configs/config.json.

    Validation errors raised as HTTPException are logged and passed through;
    any other failure is reported as HTTP 400 with the error text.
    """
    try:
        logger.info(f"Configuring audio with data: {config_data}")
        if not audio_api.set_values(config_data):
            return None
        # Persist exactly what was applied, plus the fixed use_jit flag.
        settings = {**config_data.dict(), "use_jit": False}
        with open("configs/config.json", "w", encoding='utf-8') as j:
            json.dump(settings, j, ensure_ascii=False)
        logger.info("Configuration set successfully")
        return {"message": "Configuration set successfully"}
    except HTTPException as e:
        logger.error(f"Configuration error: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Configuration failed: {e}")
        raise HTTPException(status_code=400, detail=f"Configuration failed: {e}")
518
-
519
@app.post("/start")
def start_conversion():
    """Start realtime conversion.

    Returns a confirmation message on success.

    Raises:
        HTTPException: 400 if conversion is already running; 500 (with the
            underlying error text) if starting fails for any other reason.
    """
    try:
        if audio_api.flag_vc:
            logger.warning("Audio conversion already running")
            raise HTTPException(status_code=400, detail="Audio conversion already running")
        audio_api.start_vc()
        return {"message": "Audio conversion started"}
    except HTTPException as e:
        logger.error(f"Start conversion error: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Failed to start conversion: {e}")
        # f-string: the original sent the literal text "{e}" to the client.
        raise HTTPException(status_code=500, detail=f"Failed to start conversion: {e}")
534
-
535
@app.post("/stop")
def stop_conversion():
    """Stop realtime conversion.

    Clears ``audio_api.flag_vc`` (the stream loop polls it) and resets the
    module-level ``stream_latency`` to -1.

    Raises:
        HTTPException: 400 if conversion is not running; 500 (with the
            underlying error text) on any other failure.
    """
    global stream_latency
    try:
        if not audio_api.flag_vc:
            logger.warning("Audio conversion not running")
            raise HTTPException(status_code=400, detail="Audio conversion not running")
        audio_api.flag_vc = False
        stream_latency = -1
        return {"message": "Audio conversion stopped"}
    except HTTPException as e:
        logger.error(f"Stop conversion error: {e.detail}")
        raise
    except Exception as e:
        logger.error(f"Failed to stop conversion: {e}")
        # f-string: the original sent the literal text "{e}" to the client.
        raise HTTPException(status_code=500, detail=f"Failed to stop conversion: {e}")
552
-
553
if __name__ == "__main__":
    # Script entry point: prepare the environment, pull in the project
    # modules, wire up the engine and serve the API.
    if sys.platform == "win32":
        freeze_support()
    load_dotenv()
    os.environ.update({"OMP_NUM_THREADS": "4"})
    if sys.platform == "darwin":
        os.environ.update({"PYTORCH_ENABLE_MPS_FALLBACK": "1"})

    # These imports must stay at module scope: AudioAPI looks up TorchGate
    # and rvc_for_realtime as module globals at call time.
    from tools.torchgate import TorchGate
    import tools.rvc_for_realtime as rvc_for_realtime
    from configs.config import Config

    audio_api.config = Config()
    audio_api.initialize_queues()
    uvicorn.run(app, host="0.0.0.0", port=6242)