"""Gradio demo: convert an input recording to a selected speaker's voice.

The heavy lifting (model loading, inference, f0 plotting, error metrics) is
delegated to the project-local ``infer_tool`` module; this script only wires
those helpers to a small Gradio UI.
"""
import time

import gradio as gr
import soundfile
import torch

import infer_tool

convert_cnt = [0]
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "1121_epochs.pth"
config_name = "config.json"
net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(
    model_name, config_name
)

# Read parameters from the loaded config.
target_sample = hps_ms.data.sampling_rate

# Display name -> speaker id passed to ``infer_tool.infer``.
spk_dict = {
    "鸢一折纸": 0,
    "时崎狂三": 1,
    "冰芽川四糸乃": 2,
    "五河琴里": 3,
    "八舞夕弦": 4,
    "八舞耶俱矢": 5,
    "诱宵美九": 6,
    "夜刀神十香": 7,
}

# Help text shown at the top of the "Basic" tab. Kept at column 0 so that
# Markdown never interprets leading indentation as a code block.
USAGE_NOTES_MD = """
源码参考: [xiaolang/sovits_f0](https://huggingface.co/spaces/xiaolang/sovits_f0/tree/main)

**音频格式为wav**

转换效果取决于源音频语气、节奏是否与目标音色相近。

源音频为女声时,**建议降3-6key**,**最后的输出误差越接近0,音准越高**

源音频为**低音男声**时,**建议升3key,具体看曲线图情况**

f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高

若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好
"""


def vc_fn(sid, audio_record, audio_upload, tran):
    """Run voice conversion for one request coming from the UI.

    Args:
        sid: Speaker display name; must be a key of ``spk_dict``.
        audio_record: File path of the microphone recording, or ``None``.
        audio_upload: File path of the uploaded audio, or ``None``. Takes
            precedence over ``audio_record`` when both are given.
        tran: Pitch shift in semitones, forwarded to ``infer_tool``.
            ``None`` (an emptied number field) is treated as 0.

    Returns:
        A 3-tuple ``(message, audio, image)`` matching the three output
        components: a status/metrics string, ``(sample_rate, samples)`` for
        the audio player, and an image update. When no input audio was
        provided, the message explains that and the other two are ``None``.
    """
    print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    if audio_upload is not None:
        audio_path = audio_upload
    elif audio_record is not None:
        audio_path = audio_record
    else:
        # Previously this fell through and crashed with UnboundLocalError.
        return "请先上传wav音频文件或使用网页录音后再转换", None, None

    if tran is None:
        tran = 0

    # The return value is not needed here, but the call is kept because
    # format_wav may rewrite the input file — TODO confirm in infer_tool.
    # NOTE(review): the upload label advertises a 180-second limit, yet no
    # duration check is enforced here — confirm whether that is intended.
    infer_tool.format_wav(audio_path, target_sample)

    o_audio, _out_sr = infer_tool.infer(
        audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input
    )
    out_path = "./out_temp.wav"
    soundfile.write(out_path, o_audio, target_sample)

    # Presumably renders the f0 comparison plot to "temp.jpg", which is the
    # file handed to the image component below — verify against infer_tool.
    infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
    mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)

    message = (
        "分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n"
        "若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n"
        f"半音偏差:{mistake}\n半音方差:{var}"
    )
    return message, (target_sample, o_audio), gr.Image.update("temp.jpg")


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value=USAGE_NOTES_MD)
            speaker_names = list(spk_dict)
            # The default must be one of the choices (a speaker name), not
            # the list of numeric ids, otherwise spk_dict[sid] fails.
            speaker_id = gr.Dropdown(
                label="音色", choices=speaker_names, value=speaker_names[0]
            )
            record_input = gr.Audio(
                source="microphone",
                label="录制你的声音",
                type="filepath",
                elem_id="audio_inputs",
            )
            upload_input = gr.Audio(
                source="upload",
                label="上传音频(长度小于180秒)",
                type="filepath",
                elem_id="audio_inputs",
            )
            vc_transform = gr.Number(
                label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0
            )
            vc_submit = gr.Button("转换", variant="primary")
            out_message = gr.Textbox(label="Output Message")
            out_audio = gr.Audio(label="Output Audio")
            f0_image = gr.Image(label="f0曲线")
        vc_submit.click(
            vc_fn,
            [speaker_id, record_input, upload_input, vc_transform],
            [out_message, out_audio, f0_image],
        )

app.launch(share=True)