File size: 12,952 Bytes
4738a88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"VoiceCraft Inference Text To Speech Demo\n",
"===\n",
"This will install a ton of dependencies all over so consider using the provided docker container start-jupyter script to keep the cruft off your dev box.\n",
"\n",
"Run the next cells one at a time up until the *STOP* and follow those instructions before continuing. You only have to do this the first time to setup the container."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Only do the below if you are using docker"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install OS deps\n",
"!sudo apt-get update && sudo apt-get install -y \\\n",
" git-core \\\n",
" ffmpeg \\\n",
" espeak-ng"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update and setup Conda voicecraft environment\n",
"!conda update -y -n base -c conda-forge conda\n",
"!conda create -y -n voicecraft python=3.9.16 && \\\n",
" conda init bash"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install conda and pip stuff in the activated conda above context\n",
"!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
"\n",
"# make sure $HOME and $USER are setup so this will source the conda environment\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
" pip install torch==2.0.1 && \\\n",
" pip install tensorboard==2.16.2 && \\\n",
" pip install phonemizer==3.2.1 && \\\n",
" pip install torchaudio==2.0.2 && \\\n",
" pip install datasets==2.16.0 && \\\n",
" pip install torchmetrics==0.11.1\n",
"\n",
"# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# okay setup the conda environment such that jupyter notebook can find the kernel\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" conda install -y -n voicecraft ipykernel --update-deps --force-reinstall\n",
"\n",
"# installs the Jupyter kernel into /home/myusername/.local/share/jupyter/kernels/voicecraft\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" python3 -m ipykernel install --user --name=voicecraft"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# STOP\n",
"You have to do this part manually using the mouse/keyboard and the tabs at the top.\n",
"\n",
"* Refresh your browser to make sure it picks up the new kernel.\n",
"* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",
"* Kernel -> Restart Kernel -> Yes\n",
"\n",
"Now you can run the rest of the notebook and get an audio sample output. It will automatically download more models and such. The next time you use this container, you can just start below here as the dependencies will remain available until you delete the docker container."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Only do the above if you are using docker"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import libs\n",
"# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n",
"\n",
"import torch\n",
"import torchaudio\n",
"import numpy as np\n",
"import random\n",
"\n",
"from data.tokenizer import (\n",
" AudioTokenizer,\n",
" TextTokenizer,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install MFA models and dictionaries if you haven't done so already\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa model download dictionary english_us_arpa && \\\n",
" mfa model download acoustic english_us_arpa"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load model, encodec, and phn2num\n",
"# # load model, tokenizer, and other necessary files\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"from models import voicecraft\n",
"#import models.voicecraft as voicecraft\n",
"voicecraft_name=\"giga830M.pth\" # or giga330M.pth\n",
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
"if not os.path.exists(ckpt_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
"if not os.path.exists(encodec_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
"\n",
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
"model.load_state_dict(ckpt[\"model\"])\n",
"model.to(device)\n",
"model.eval()\n",
"\n",
"phn2num = ckpt['phn2num']\n",
"\n",
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
"audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare your audio\n",
"# point to the original audio whose speech you want to clone\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
"temp_folder = \"./demo/temp\"\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
" f.write(orig_transcript)\n",
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
" english_us_arpa english_us_arpa {align_temp}\n",
"\n",
"# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# !source ~/.bashrc && \\\n",
"# conda activate voicecraft && \\\n",
"# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
"# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
"\n",
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
"\n",
"# run the model to get the output\n",
"# hyperparameters for inference\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
"silence_tokens=[1388,1898,131]\n",
"kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",
"\n",
"# NOTE adjust the below three arguments if the generation is not as good\n",
"stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n",
"sample_batch_size = 4 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",
"seed = 1 # change seed if you are still unhappy with the result\n",
"\n",
"def seed_everything(seed):\n",
" os.environ['PYTHONHASHSEED'] = str(seed)\n",
" random.seed(seed)\n",
" np.random.seed(seed)\n",
" torch.manual_seed(seed)\n",
" torch.cuda.manual_seed(seed)\n",
" torch.backends.cudnn.benchmark = False\n",
" torch.backends.cudnn.deterministic = True\n",
"seed_everything(seed)\n",
"\n",
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
"from inference_tts_scale import inference_one_sample\n",
"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
" \n",
"# save segments for comparison\n",
"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
"\n",
"\n",
"# display the audio\n",
"from IPython.display import Audio\n",
"print(\"concatenate prompt and generated:\")\n",
"display(Audio(concated_audio, rate=codec_audio_sr))\n",
"\n",
"print(\"generated:\")\n",
"display(Audio(gen_audio, rate=codec_audio_sr))\n",
"\n",
"# # save the audio\n",
"# # output_dir\n",
"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
"# os.makedirs(output_dir, exist_ok=True)\n",
"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
"\n",
"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
"\n",
"# if you get error importing T5 in transformers\n",
"# try \n",
"# pip uninstall Pillow\n",
"# pip install Pillow\n",
"# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "voicecraft",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
|