import os
import shutil
import subprocess
import tarfile
import zipfile
from pathlib import Path

import requests
from tqdm import tqdm


def download_file(url: str, target_path: str, chunk_size: int = 1024 * 1024,
                  timeout: float = 30.0):
    """Stream ``url`` to ``target_path`` with a progress bar.

    The payload is first written to ``<target_path>.part`` and only renamed
    to ``target_path`` once the transfer finished, so an interrupted or
    failed download never leaves a truncated file at the final location.

    Args:
        url: Address of the file to fetch.
        target_path: Destination file path. Its parent directory must exist.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Connect/read timeout in seconds passed to ``requests``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    target = Path(target_path)
    partial = target.with_name(target.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTML error page would be saved as the archive.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(partial, 'wb') as file, tqdm(
                desc="Downloading",
                # Unknown size: let tqdm show a plain byte counter.
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    pbar.update(size)
        os.replace(partial, target)
    finally:
        # Only still present when the transfer did not complete.
        if partial.exists():
            partial.unlink()


def _organize_speaker_dirs(source_root: Path, target_dir: Path,
                           prefix: str = "") -> None:
    """Move each sub-directory of ``source_root`` to ``target_dir/<prefix><name>``.

    Destinations that already exist are left untouched. Entries that are not
    directories are ignored.
    """
    for name in tqdm(os.listdir(source_root), desc="Organizing files"):
        src_dir = source_root / name
        if not src_dir.is_dir():
            continue
        dst_dir = target_dir / f"{prefix}{name}"
        if not dst_dir.exists():
            # The extracted tree is deleted afterwards, so moving gives the
            # same end state as copying without duplicating the audio data.
            shutil.move(str(src_dir), str(dst_dir))


def _cleanup(archive_path: Path, extracted_root: Path) -> None:
    """Delete the downloaded archive and the extracted directory tree."""
    if archive_path.exists():
        os.remove(archive_path)
    if extracted_root.exists():
        shutil.rmtree(extracted_root)


def download_vctk(target_dir: str = "data/raw"):
    """Download the VCTK dataset and place one directory per speaker in ``target_dir``.

    Steps: download ``vctk.zip`` (skipped if already present), extract it
    (skipped if ``VCTK-Corpus`` already exists), move every directory found
    under ``VCTK-Corpus/wav48`` directly into ``target_dir``, then delete the
    archive and the extracted tree.

    NOTE(review): the code expects the archive to unpack into
    ``VCTK-Corpus/wav48``; confirm this matches the layout of the 0.92 zip.

    Args:
        target_dir: Directory that receives the speaker folders.
    """
    url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
    target_dir = Path(target_dir)
    zip_path = target_dir / "vctk.zip"
    extracted_root = target_dir / "VCTK-Corpus"

    # Create the target directory.
    os.makedirs(target_dir, exist_ok=True)

    # Download the archive.
    if not zip_path.exists():
        print("Downloading VCTK dataset...")
        download_file(url, str(zip_path))

    # Extract the archive.
    if not extracted_root.exists():
        print("\nExtracting VCTK dataset...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)

    # Flatten the layout: one folder per speaker directly under target_dir.
    _organize_speaker_dirs(extracted_root / "wav48", target_dir)

    # Remove the archive and the now-unneeded extracted tree.
    _cleanup(zip_path, extracted_root)


def download_librispeech(target_dir: str = "data/raw", subset: str = "dev-clean"):
    """Download one LibriSpeech subset and place its speaker folders in ``target_dir``.

    Steps: download ``librispeech_<subset>.tar.gz`` (skipped if already
    present), extract it (skipped if ``LibriSpeech`` already exists), move
    every directory under ``LibriSpeech/<subset>`` to
    ``target_dir/libri_<name>``, then delete the archive and the extracted
    tree.

    Args:
        target_dir: Directory that receives the ``libri_*`` folders.
        subset: Name of the subset archive on OpenSLR, e.g. ``"dev-clean"``.
    """
    url = f"https://www.openslr.org/resources/12/{subset}.tar.gz"
    target_dir = Path(target_dir)
    tar_path = target_dir / f"librispeech_{subset}.tar.gz"
    extracted_root = target_dir / "LibriSpeech"

    # Create the target directory.
    os.makedirs(target_dir, exist_ok=True)

    # Download the archive.
    if not tar_path.exists():
        print(f"Downloading LibriSpeech {subset} dataset...")
        download_file(url, str(tar_path))

    # Extract the archive.
    if not extracted_root.exists():
        print(f"\nExtracting LibriSpeech {subset} dataset...")
        with tarfile.open(tar_path, 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would escape target_dir; only
                # available on Python versions with extraction filters.
                tar.extractall(target_dir, filter="data")
            else:
                tar.extractall(target_dir)

    # Flatten the layout: one "libri_<name>" folder per speaker.
    _organize_speaker_dirs(extracted_root / subset, target_dir, prefix="libri_")

    # Remove the archive and the now-unneeded extracted tree.
    _cleanup(tar_path, extracted_root)


def download_aishell3(target_dir: str = "data/raw"):
    """Print manual download instructions for AISHELL-3 (requires an OpenSLR account).

    Args:
        target_dir: Directory the user is asked to extract the dataset into.
    """
    print("AISHELL-3 dataset needs to be downloaded manually from:")
    print("https://www.openslr.org/93/")
    print(f"Please download and extract it to {target_dir}")


def main():
    """Parse command-line arguments and run the selected dataset download."""
    import argparse

    parser = argparse.ArgumentParser(description="Download speech datasets")
    parser.add_argument("--dataset", type=str,
                        choices=["vctk", "librispeech", "aishell3"],
                        required=True, help="Dataset to download")
    parser.add_argument("--target_dir", type=str, default="data/raw",
                        help="Directory to save the dataset")
    parser.add_argument("--subset", type=str, default="dev-clean",
                        help="LibriSpeech subset to download")
    args = parser.parse_args()

    if args.dataset == "vctk":
        download_vctk(args.target_dir)
    elif args.dataset == "librispeech":
        download_librispeech(args.target_dir, args.subset)
    else:
        download_aishell3(args.target_dir)


if __name__ == "__main__":
    main()