import time

import gradio as gr
import soundfile
import torch

import infer_tool
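# Local helper module bundled with this Space; it provides the model loading,
# wav normalization, inference, f0 plotting, and error metrics used below.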

convert_cnt = [0]
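# Use the GPU when available, otherwise fall back to the CPU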
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "1121_epochs.pth"
config_name = "config.json"
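# Load the trained checkpoint and its config through infer_tool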
net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(model_name, config_name)

# Read parameters from the model config
target_sample = hps_ms.data.sampling_rate
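# Display names of the selectable voices, mapped to the model's speaker IDs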
spk_dict = {
    "鸢一折纸": 0,
    "时崎狂三": 1,
    "冰芽川四糸乃": 2,
    "五河琴里": 3,
    "八舞夕弦": 4,
    "八舞耶俱矢": 5,
    "诱宵美九": 6,
    "夜刀神十香": 7
}


def vc_fn(sid, audio_record, audio_upload, tran):
    """Convert the input audio to the selected speaker, shifted by `tran` semitones."""
    print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # Prefer the uploaded file; fall back to the microphone recording
    if audio_upload is not None:
        audio_path = audio_upload
    elif audio_record is not None:
        audio_path = audio_record
    else:
        return "Please record or upload an audio clip first", None, None

    audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
    duration = audio.shape[0] / sampling_rate
    if duration > 180:
        # Enforce the 180-second limit advertised in the upload label
        return "Please keep the audio under 180 seconds", None, None

    o_audio, out_sr = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
    out_path = "./out_temp.wav"
    soundfile.write(out_path, o_audio, target_sample)
    # f0_plt saves the pitch-curve comparison image (temp.jpg) shown in the UI
    infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
    mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
    return f"Segment error reference: 0.3 is excellent, around 0.5 is reasonable, a few values of 0.8-1 are acceptable\nIf the deviation is too large, adjust the semitone shift; if it stays large after several adjustments, the song exceeds the singer's range\nSemitone deviation: {mistake}\nSemitone variance: {var}", (
        target_sample, o_audio), gr.Image.update("temp.jpg")


app = gr.Blocks()
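# One "Basic" tab holds all controls; clicking the button runs vc_fn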
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
            Source reference: [xiaolang/sovits_f0](https://huggingface.co/spaces/xiaolang/sovits_f0/tree/main)

            **Audio must be in wav format**

            Conversion quality depends on how closely the source audio's tone and rhythm match the target voice.

            For a female source voice, **lowering by 3-6 semitones is recommended**; **the closer the final output error is to 0, the better the pitch accuracy**.

            For a **low-pitched male source voice**, **raising by 3 semitones is recommended; check the f0 curve to decide**.

            The f0 curve shows off-pitch segments at a glance: blue is the input pitch, orange is the pitch of the synthesized audio.

            If you **only see orange**, the blue curve is covered by it, which indicates a good conversion.

            """)
            # The default value must be one of the choices (a name), not the ID list
            speaker_id = gr.Dropdown(label="Voice", choices=list(spk_dict.keys()), value=list(spk_dict.keys())[0])
            record_input = gr.Audio(source="microphone", label="Record your voice", type="filepath", elem_id="audio_inputs")
            upload_input = gr.Audio(source="upload", label="Upload audio (under 180 seconds)", type="filepath",
                                    elem_id="audio_inputs")
            vc_transform = gr.Number(label="Pitch shift in semitones (integer, may be negative; +12 raises one octave)", value=0)
            vc_submit = gr.Button("Convert", variant="primary")
            out_message = gr.Textbox(label="Output Message")
            out_audio = gr.Audio(label="Output Audio")
            f0_image = gr.Image(label="f0 curve")
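        # Wire the button: the input components map to vc_fn's parameters in order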
        vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
                        [out_message, out_audio, f0_image])
    app.launch(share=True)