import os
import sys
import json
import argparse
import subprocess
from functools import lru_cache

try:
    from distutils.util import strtobool
except ImportError:
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632). Provide a drop-in replacement so the name stays importable.
    def strtobool(val):
        """Convert a string truth value to 1 or 0.

        Accepts y/yes/t/true/on/1 and n/no/f/false/off/0 (case-insensitive);
        raises ValueError for anything else.
        """
        val = val.lower()
        if val in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if val in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError("invalid truth value %r" % (val,))


# The working directory must be on sys.path before the `rvc` package import.
now_dir = os.getcwd()
sys.path.append(now_dir)

current_script_directory = os.path.dirname(os.path.realpath(__file__))
logs_path = os.path.join(current_script_directory, "logs")

from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline

# Interpreter used to launch helper scripts as subprocesses.
python = sys.executable


# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
@lru_cache(maxsize=1)  # Cache only one result since the file is static
def load_voices_data():
    """Load and return the parsed contents of ``rvc/lib/tools/tts_voices.json``.

    The path is relative, so it is resolved against the current working
    directory. The result is cached after the first call.
    """
    voices_file = os.path.join("rvc", "lib", "tools", "tts_voices.json")
    with open(voices_file, "r", encoding="utf-8") as file:
        return json.load(file)


voices_data = load_voices_data()
# Unique "ShortName" values of the voices; built from a set, so the order is
# not guaranteed.
locales = list({voice["ShortName"] for voice in voices_data})


@lru_cache(maxsize=None)
def import_voice_converter():
    """Return a cached ``VoiceConverter`` instance.

    The import is done lazily inside the function so that ``rvc.infer.infer``
    is only loaded when a conversion is actually requested.
    """
    from rvc.infer.infer import VoiceConverter

    return VoiceConverter()


@lru_cache(maxsize=1)
def get_config():
    """Return a cached ``Config`` instance, importing it lazily."""
    from rvc.configs.config import Config

    return Config()


# Infer
def run_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_path: str,
    output_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a single audio file using the cached ``VoiceConverter``.

    Every argument is forwarded to ``VoiceConverter.convert_audio`` as a
    keyword argument. Most keep their name; the exceptions are:

    * ``input_path`` -> ``audio_input_path``
    * ``output_path`` -> ``audio_output_path``
    * ``pth_path`` -> passed as both ``model_path`` and ``pth_path``
    * ``reverb_wet_gain`` -> ``reverb_wet_level``
    * ``reverb_dry_gain`` -> ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        None.
    """
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        # Paths and model selection
        audio_input_path=input_path,
        audio_output_path=output_path,
        model_path=pth_path,
        index_path=index_path,
        pth_path=pth_path,
        # Core conversion settings
        pitch=pitch,
        filter_radius=filter_radius,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        hop_length=hop_length,
        f0_method=f0_method,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        f0_file=f0_file,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        # Formant shifting and post-processing switches
        post_process=post_process,
        formant_shifting=formant_shifting,
        formant_qfrency=formant_qfrency,
        formant_timbre=formant_timbre,
        reverb=reverb,
        pitch_shift=pitch_shift,
        limiter=limiter,
        gain=gain,
        distortion=distortion,
        chorus=chorus,
        bitcrush=bitcrush,
        clipping=clipping,
        compressor=compressor,
        delay=delay,
        # Post-processing effect parameters
        reverb_room_size=reverb_room_size,
        reverb_damping=reverb_damping,
        reverb_wet_level=reverb_wet_gain,
        reverb_dry_level=reverb_dry_gain,
        reverb_width=reverb_width,
        reverb_freeze_mode=reverb_freeze_mode,
        pitch_shift_semitones=pitch_shift_semitones,
        limiter_threshold=limiter_threshold,
        limiter_release=limiter_release_time,
        gain_db=gain_db,
        distortion_gain=distortion_gain,
        chorus_rate=chorus_rate,
        chorus_depth=chorus_depth,
        chorus_delay=chorus_center_delay,
        chorus_feedback=chorus_feedback,
        chorus_mix=chorus_mix,
        bitcrush_bit_depth=bitcrush_bit_depth,
        clipping_threshold=clipping_threshold,
        compressor_threshold=compressor_threshold,
        compressor_ratio=compressor_ratio,
        compressor_attack=compressor_attack,
        compressor_release=compressor_release,
        delay_seconds=delay_seconds,
        delay_feedback=delay_feedback,
        delay_mix=delay_mix,
        sid=sid,
    )


# Batch infer
def run_batch_infer_script(
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    input_folder: str,
    output_folder: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    formant_shifting: bool = False,
    formant_qfrency: float = 1.0,
    formant_timbre: float = 1.0,
    post_process: bool = False,
    reverb: bool = False,
    pitch_shift: bool = False,
    limiter: bool = False,
    gain: bool = False,
    distortion: bool = False,
    chorus: bool = False,
    bitcrush: bool = False,
    clipping: bool = False,
    compressor: bool = False,
    delay: bool = False,
    reverb_room_size: float = 0.5,
    reverb_damping: float = 0.5,
    reverb_wet_gain: float = 0.5,
    reverb_dry_gain: float = 0.5,
    reverb_width: float = 0.5,
    reverb_freeze_mode: float = 0.5,
    pitch_shift_semitones: float = 0.0,
    limiter_threshold: float = -6,
    limiter_release_time: float = 0.01,
    gain_db: float = 0.0,
    distortion_gain: float = 25,
    chorus_rate: float = 1.0,
    chorus_depth: float = 0.25,
    chorus_center_delay: float = 7,
    chorus_feedback: float = 0.0,
    chorus_mix: float = 0.5,
    bitcrush_bit_depth: int = 8,
    clipping_threshold: float = -6,
    compressor_threshold: float = 0,
    compressor_ratio: float = 1,
    compressor_attack: float = 1.0,
    compressor_release: float = 100,
    delay_seconds: float = 0.5,
    delay_feedback: float = 0.0,
    delay_mix: float = 0.5,
    sid: int = 0,
):
    """Convert a folder of audio files using the cached ``VoiceConverter``.

    Every argument is forwarded to ``VoiceConverter.convert_audio_batch`` as
    a keyword argument. Most keep their name; the exceptions are:

    * ``input_folder`` -> ``audio_input_paths``
    * ``output_folder`` -> ``audio_output_path``
    * ``pth_path`` -> passed as both ``model_path`` and ``pth_path``
    * ``reverb_wet_gain`` -> ``reverb_wet_level``
    * ``reverb_dry_gain`` -> ``reverb_dry_level``
    * ``limiter_release_time`` -> ``limiter_release``
    * ``chorus_center_delay`` -> ``chorus_delay``

    Returns:
        A status message naming ``input_folder``.
    """
    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio_batch(
        # Paths and model selection
        audio_input_paths=input_folder,
        audio_output_path=output_folder,
        model_path=pth_path,
        index_path=index_path,
        pth_path=pth_path,
        # Core conversion settings
        pitch=pitch,
        filter_radius=filter_radius,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        hop_length=hop_length,
        f0_method=f0_method,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        f0_file=f0_file,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        # Formant shifting and post-processing switches
        post_process=post_process,
        formant_shifting=formant_shifting,
        formant_qfrency=formant_qfrency,
        formant_timbre=formant_timbre,
        reverb=reverb,
        pitch_shift=pitch_shift,
        limiter=limiter,
        gain=gain,
        distortion=distortion,
        chorus=chorus,
        bitcrush=bitcrush,
        clipping=clipping,
        compressor=compressor,
        delay=delay,
        # Post-processing effect parameters
        reverb_room_size=reverb_room_size,
        reverb_damping=reverb_damping,
        reverb_wet_level=reverb_wet_gain,
        reverb_dry_level=reverb_dry_gain,
        reverb_width=reverb_width,
        reverb_freeze_mode=reverb_freeze_mode,
        pitch_shift_semitones=pitch_shift_semitones,
        limiter_threshold=limiter_threshold,
        limiter_release=limiter_release_time,
        gain_db=gain_db,
        distortion_gain=distortion_gain,
        chorus_rate=chorus_rate,
        chorus_depth=chorus_depth,
        chorus_delay=chorus_center_delay,
        chorus_feedback=chorus_feedback,
        chorus_mix=chorus_mix,
        bitcrush_bit_depth=bitcrush_bit_depth,
        clipping_threshold=clipping_threshold,
        compressor_threshold=compressor_threshold,
        compressor_ratio=compressor_ratio,
        compressor_attack=compressor_attack,
        compressor_release=compressor_release,
        delay_seconds=delay_seconds,
        delay_feedback=delay_feedback,
        delay_mix=delay_mix,
        sid=sid,
    )

    return f"Files from {input_folder} inferred successfully."


# TTS
def run_tts_script(
    tts_file: str,
    tts_text: str,
    tts_voice: str,
    tts_rate: int,
    pitch: int,
    filter_radius: int,
    index_rate: float,
    volume_envelope: int,
    protect: float,
    hop_length: int,
    f0_method: str,
    output_tts_path: str,
    output_rvc_path: str,
    pth_path: str,
    index_path: str,
    split_audio: bool,
    f0_autotune: bool,
    f0_autotune_strength: float,
    clean_audio: bool,
    clean_strength: float,
    export_format: str,
    f0_file: str,
    embedder_model: str,
    embedder_model_custom: str = None,
    sid: int = 0,
):
    """Synthesize speech with the TTS helper script, then voice-convert it.

    Steps:
      1. Delete ``output_tts_path`` if it already exists.
      2. Run ``rvc/lib/tools/tts.py`` with the current interpreter, passing
         ``tts_file``, ``tts_text``, ``tts_voice``, ``tts_rate`` and
         ``output_tts_path`` as string arguments.
      3. Call ``VoiceConverter.convert_audio`` with ``output_tts_path`` as
         input and ``output_rvc_path`` as output. Formant-shifting and
         post-processing options are all passed as ``None``.

    Returns:
        None.
    """
    tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")

    # Remove any stale TTS output from a previous run.
    if os.path.exists(output_tts_path):
        os.remove(output_tts_path)

    # subprocess requires string arguments (tts_rate is an int).
    command_tts = [
        str(argument)
        for argument in (
            python,
            tts_script_path,
            tts_file,
            tts_text,
            tts_voice,
            tts_rate,
            output_tts_path,
        )
    ]
    # NOTE(review): the exit status of the TTS script is not checked; if it
    # fails, the conversion below is still attempted on output_tts_path.
    subprocess.run(command_tts)

    infer_pipeline = import_voice_converter()
    infer_pipeline.convert_audio(
        pitch=pitch,
        filter_radius=filter_radius,
        index_rate=index_rate,
        volume_envelope=volume_envelope,
        protect=protect,
        hop_length=hop_length,
        f0_method=f0_method,
        audio_input_path=output_tts_path,
        audio_output_path=output_rvc_path,
        model_path=pth_path,
        index_path=index_path,
        split_audio=split_audio,
        f0_autotune=f0_autotune,
        f0_autotune_strength=f0_autotune_strength,
        clean_audio=clean_audio,
        clean_strength=clean_strength,
        export_format=export_format,
        f0_file=f0_file,
        embedder_model=embedder_model,
        embedder_model_custom=embedder_model_custom,
        sid=sid,
        formant_shifting=None,
        formant_qfrency=None,
        formant_timbre=None,
        post_process=None,
        reverb=None,
        pitch_shift=None,
        limiter=None,
        gain=None,
        distortion=None,
        chorus=None,
        bitcrush=None,
        clipping=None,
        compressor=None,
        delay=None,
        sliders=None,
    )


# Prerequisites
def run_prerequisites_script(
    models: bool,
    exe: bool,
):
    """Run the prerequisites download pipeline and return a status message.

    Args:
        models: Forwarded as the first argument of
            ``prequisites_download_pipeline``.
        exe: Forwarded as the second argument of
            ``prequisites_download_pipeline``.

    Returns:
        The string ``"Prerequisites installed successfully."``.
    """
    prequisites_download_pipeline(
        models,
        exe,
    )
    return "Prerequisites installed successfully."