neorvc / rvc /rvc_cli.py
NeoPy's picture
Upload folder using huggingface_hub
1c7d911 verified
import os
import sys
import json
import argparse
import subprocess
from functools import lru_cache
from distutils.util import strtobool
# Add the current working directory to the import search path.
# NOTE(review): presumably this lets the project-local ``rvc`` package resolve
# when the script is started from the repository root - confirm.
now_dir = os.getcwd()
sys.path.append(now_dir)
# Directory that contains this file, and the "logs" folder located beside it.
current_script_directory = os.path.dirname(os.path.realpath(__file__))
logs_path = os.path.join(current_script_directory, "logs")
# Project import kept below the sys.path change above; "prequisites" is the
# spelling exported by the imported module, not a typo introduced here.
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
# Path of the running interpreter; used to launch the TTS helper script.
python = sys.executable
# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
@lru_cache(maxsize=1)  # Cache only one result since the file is static
def load_voices_data():
    """Read and parse the TTS voice list shipped with the project.

    The file path ``rvc/lib/tools/tts_voices.json`` is relative, so it is
    resolved against the current working directory at the time of the first
    call. The parsed JSON is memoised by ``lru_cache``; later calls return the
    very same object without touching the disk again.

    Returns:
        The decoded JSON content of ``tts_voices.json``.
    """
    voices_json_path = os.path.join("rvc", "lib", "tools", "tts_voices.json")
    with open(voices_json_path, "r", encoding="utf-8") as voices_file:
        parsed_voices = json.load(voices_file)
    return parsed_voices
# Parsed TTS voice entries, loaded once when this module is imported.
voices_data = load_voices_data()
# Unique "ShortName" values of the voices. A set removes duplicates; sorting
# gives a stable order, because iterating a set of strings can differ from one
# interpreter run to the next (string hash randomization).
locales = sorted({voice["ShortName"] for voice in voices_data})
@lru_cache(maxsize=None)
def import_voice_converter():
    """Return the shared ``VoiceConverter`` instance, building it on first use.

    The import is performed inside the function, so ``rvc.infer.infer`` is only
    loaded when a converter is actually requested. Because the function takes
    no arguments, ``lru_cache`` hands back the same instance on every call.
    """
    from rvc.infer.infer import VoiceConverter

    converter = VoiceConverter()
    return converter
@lru_cache(maxsize=1)
def get_config():
    """Return the shared ``Config`` instance, building it on first use.

    The import is performed inside the function, so ``rvc.configs.config`` is
    only loaded when the configuration is actually requested. ``lru_cache``
    keeps the single created object and returns it on every later call.
    """
    from rvc.configs.config import Config

    config_instance = Config()
    return config_instance
# Infer
def run_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_path: str,
    output_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a single audio file with the cached ``VoiceConverter``.

    Every argument is forwarded by keyword to ``convert_audio`` on the object
    returned by ``import_voice_converter()``. Most arguments keep their name;
    the following are passed under a different keyword:

    * ``input_path`` -> ``audio_input_path``
    * ``output_path`` -> ``audio_output_path``
    * ``pth_path`` -> ``model_path`` (and additionally as ``pth_path``)
    * ``reverb_wet_gain`` -> ``reverb_wet_level``
    * ``reverb_dry_gain`` -> ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    The meaning and valid range of each value is defined by ``convert_audio``,
    which is not part of this module.

    Returns:
        None. Whatever ``convert_audio`` returns is discarded.
    """
    kwargs = {
        "audio_input_path": input_path,
        "audio_output_path": output_path,
        "model_path": pth_path,
        "index_path": index_path,
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        # NOTE(review): the model path is sent twice ("model_path" above and
        # "pth_path" here). Kept as-is because the callee's signature is not
        # visible from this module - confirm whether "pth_path" is still read.
        "pth_path": pth_path,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "post_process": post_process,
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
        "sid": sid,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        **kwargs,
    )
# Batch infer
def run_batch_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_folder: str,
    output_folder: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a folder of audio files with the cached ``VoiceConverter``.

    Every argument is forwarded by keyword to ``convert_audio_batch`` on the
    object returned by ``import_voice_converter()``. Most arguments keep their
    name; the following are passed under a different keyword:

    * ``input_folder`` -> ``audio_input_paths``
    * ``output_folder`` -> ``audio_output_path``
    * ``pth_path`` -> ``model_path`` (and additionally as ``pth_path``)
    * ``reverb_wet_gain`` -> ``reverb_wet_level``
    * ``reverb_dry_gain`` -> ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    The meaning and valid range of each value is defined by
    ``convert_audio_batch``, which is not part of this module.

    Returns:
        A status message naming ``input_folder``. It is returned whenever
        ``convert_audio_batch`` returns without raising; its return value is
        not inspected.
    """
    kwargs = {
        "audio_input_paths": input_folder,
        "audio_output_path": output_folder,
        "model_path": pth_path,
        "index_path": index_path,
        "pitch": pitch,
        "filter_radius": filter_radius,
        "index_rate": index_rate,
        "volume_envelope": volume_envelope,
        "protect": protect,
        "hop_length": hop_length,
        "f0_method": f0_method,
        # NOTE(review): the model path is sent twice ("model_path" above and
        # "pth_path" here). Kept as-is because the callee's signature is not
        # visible from this module - confirm whether "pth_path" is still read.
        "pth_path": pth_path,
        "split_audio": split_audio,
        "f0_autotune": f0_autotune,
        "f0_autotune_strength": f0_autotune_strength,
        "clean_audio": clean_audio,
        "clean_strength": clean_strength,
        "export_format": export_format,
        "f0_file": f0_file,
        "embedder_model": embedder_model,
        "embedder_model_custom": embedder_model_custom,
        "post_process": post_process,
        "formant_shifting": formant_shifting,
        "formant_qfrency": formant_qfrency,
        "formant_timbre": formant_timbre,
        "reverb": reverb,
        "pitch_shift": pitch_shift,
        "limiter": limiter,
        "gain": gain,
        "distortion": distortion,
        "chorus": chorus,
        "bitcrush": bitcrush,
        "clipping": clipping,
        "compressor": compressor,
        "delay": delay,
        "reverb_room_size": reverb_room_size,
        "reverb_damping": reverb_damping,
        "reverb_wet_level": reverb_wet_gain,
        "reverb_dry_level": reverb_dry_gain,
        "reverb_width": reverb_width,
        "reverb_freeze_mode": reverb_freeze_mode,
        "pitch_shift_semitones": pitch_shift_semitones,
        "limiter_threshold": limiter_threshold,
        "limiter_release": limiter_release_time,
        "gain_db": gain_db,
        "distortion_gain": distortion_gain,
        "chorus_rate": chorus_rate,
        "chorus_depth": chorus_depth,
        "chorus_delay": chorus_center_delay,
        "chorus_feedback": chorus_feedback,
        "chorus_mix": chorus_mix,
        "bitcrush_bit_depth": bitcrush_bit_depth,
        "clipping_threshold": clipping_threshold,
        "compressor_threshold": compressor_threshold,
        "compressor_ratio": compressor_ratio,
        "compressor_attack": compressor_attack,
        "compressor_release": compressor_release,
        "delay_seconds": delay_seconds,
        "delay_feedback": delay_feedback,
        "delay_mix": delay_mix,
        "sid": sid,
    }
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio_batch(
        **kwargs,
    )
    return f"Files from {input_folder} inferred successfully."
# TTS
def run_tts_script(
    tts_file: str,
    tts_text: str,
    tts_voice: str,
    tts_rate: int,
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    output_tts_path: str,
    output_rvc_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    sid: int = 0,
):
    """Run the TTS helper script, then voice-convert the audio it produced.

    Steps:

    1. Delete any file already present at ``output_tts_path``.
    2. Launch ``rvc/lib/tools/tts.py`` (path relative to the current working
       directory) with the running interpreter, passing ``tts_file``,
       ``tts_text``, ``tts_voice``, ``tts_rate`` and ``output_tts_path`` as
       positional command-line arguments, each converted with ``str``.
    3. Call ``convert_audio`` on the cached ``VoiceConverter`` with
       ``output_tts_path`` as input and ``output_rvc_path`` as output.

    ``pth_path`` is forwarded as ``model_path``. The formant and post-processing
    switches of ``convert_audio`` are not exposed here; they are all passed as
    ``None``, together with ``sliders=None``.

    Returns:
        None. Whatever ``convert_audio`` returns is discarded.

    Raises:
        subprocess.CalledProcessError: If the TTS helper exits with a non-zero
            status. Step 1 may already have removed the previous TTS output,
            so conversion is not attempted after a failed synthesis.
    """
    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")

    # Remove any previous file at the TTS output path before synthesizing.
    if os.path.exists(output_tts_path):
        os.remove(output_tts_path)

    # Argument list (no shell involved); every item is stringified because
    # tts_rate is an int.
    command_tts = [
        *map(
            str,
            [
                python,
                tts_script_path,
                tts_file,
                tts_text,
                tts_voice,
                tts_rate,
                output_tts_path,
            ],
        ),
    ]
    # check=True surfaces a failed synthesis right here instead of letting the
    # conversion below run against an input file that was never written.
    subprocess.run(command_tts, check=True)

    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        pitch=pitch,
        filter_radius=filter_radius,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        hop_length=hop_length,
        f0_method=f0_method,
        audio_input_path=output_tts_path,
        audio_output_path=output_rvc_path,
        model_path=pth_path,
        index_path=index_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        f0_file=f0_file,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        sid=sid,
        formant_shifting=None,
        formant_qfrency=None,
        formant_timbre=None,
        post_process=None,
        reverb=None,
        pitch_shift=None,
        limiter=None,
        gain=None,
        distortion=None,
        chorus=None,
        bitcrush=None,
        clipping=None,
        compressor=None,
        delay=None,
        sliders=None,
    )
# Prerequisites
def run_prerequisites_script(
    models: bool,
    exe: bool,
):
    """Run the prerequisites download pipeline and report completion.

    ``models`` and ``exe`` are handed positionally, in that order, to
    ``prequisites_download_pipeline``; what each flag selects is decided there.

    Returns:
        The fixed message ``"Prerequisites installed successfully."``, returned
        whenever the pipeline call returns without raising.
    """
    prequisites_download_pipeline(models, exe)
    success_message = "Prerequisites installed successfully."
    return success_message