Spaces:
Sleeping
Sleeping
import os | |
import requests | |
import tarfile | |
import zipfile | |
import shutil | |
from pathlib import Path | |
from tqdm import tqdm | |
import subprocess | |
def download_file(url: str, target_path: str, chunk_size: int = 1024 * 1024,
                  timeout: float = 30.0):
    """Stream ``url`` to ``target_path`` while showing a tqdm progress bar.

    The payload is first written to ``<target_path>.part`` and only renamed to
    ``target_path`` once the transfer finished, so an interrupted or failed
    download never leaves a truncated file at the final location.

    Args:
        url: Address of the file to fetch.
        target_path: Destination path of the downloaded file.
        chunk_size: Number of bytes requested per read from the response.
        timeout: Seconds to wait for the server to connect / send data before
            ``requests`` gives up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    partial_path = f"{target_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this check an HTML error page would be saved as the dataset.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))
            with open(partial_path, 'wb') as file, tqdm(
                desc="Downloading",
                # A missing Content-Length means "unknown size", not "0 bytes".
                total=total_size or None,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar:
                for data in response.iter_content(chunk_size=chunk_size):
                    size = file.write(data)
                    pbar.update(size)
        # Atomic on the same filesystem: the final path is either absent or complete.
        os.replace(partial_path, target_path)
    finally:
        # Only still present when the download did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)
def download_vctk(target_dir: str = "data/raw"):
    """Download the VCTK dataset and flatten it to one folder per speaker.

    The archive is stored as ``<target_dir>/vctk.zip`` and extracted into
    ``target_dir``. Every speaker directory found under ``VCTK-Corpus/wav48``
    is then moved to ``<target_dir>/<speaker>`` (existing destinations are left
    untouched). Finally the archive and the extracted ``VCTK-Corpus`` tree are
    deleted.

    Args:
        target_dir: Directory that receives the speaker folders; created when
            it does not exist.

    Raises:
        FileNotFoundError: If the extracted archive does not contain
            ``VCTK-Corpus/wav48``.
    """
    url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
    target_dir = Path(target_dir)
    zip_path = target_dir / "vctk.zip"
    corpus_root = target_dir / "VCTK-Corpus"

    # Create the target directory.
    target_dir.mkdir(parents=True, exist_ok=True)

    if not corpus_root.exists():
        # Fetch the archive only when it is actually needed for extraction.
        if not zip_path.exists():
            print("Downloading VCTK dataset...")
            download_file(url, str(zip_path))

        print("\nExtracting VCTK dataset...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)

    # Reorganise the file structure.
    # NOTE(review): this layout is what the code has always assumed; confirm
    # that the 0.92 archive really unpacks to VCTK-Corpus/wav48.
    vctk_dir = corpus_root / "wav48"
    if not vctk_dir.is_dir():
        raise FileNotFoundError(
            f"Expected speaker directories under {vctk_dir}, but it does not exist"
        )

    speaker_dirs = sorted(entry for entry in vctk_dir.iterdir() if entry.is_dir())
    for src_dir in tqdm(speaker_dirs, desc="Organizing files"):
        dst_dir = target_dir / src_dir.name
        if not dst_dir.exists():
            # The source tree is deleted below anyway, so move instead of
            # copying every audio file a second time.
            shutil.move(str(src_dir), str(dst_dir))

    # Clean up the downloaded archive and the extracted tree.
    if zip_path.exists():
        zip_path.unlink()
    if corpus_root.exists():
        shutil.rmtree(corpus_root)
def download_librispeech(target_dir: str = "data/raw", subset: str = "dev-clean"):
    """Download one LibriSpeech subset and flatten it to one folder per speaker.

    The archive is stored as ``<target_dir>/librispeech_<subset>.tar.gz`` and
    extracted into ``target_dir``. Every speaker directory found under
    ``LibriSpeech/<subset>`` is then moved to ``<target_dir>/libri_<speaker>``
    (existing destinations are left untouched). Finally the archive and the
    extracted ``LibriSpeech`` tree are deleted.

    Args:
        target_dir: Directory that receives the speaker folders; created when
            it does not exist.
        subset: Name of the subset archive on OpenSLR, e.g. ``"dev-clean"``.

    Raises:
        FileNotFoundError: If the extracted archive does not contain
            ``LibriSpeech/<subset>``.
    """
    url = f"https://www.openslr.org/resources/12/{subset}.tar.gz"
    target_dir = Path(target_dir)
    tar_path = target_dir / f"librispeech_{subset}.tar.gz"
    corpus_root = target_dir / "LibriSpeech"
    libri_dir = corpus_root / subset

    # Create the target directory.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Check the subset directory rather than the corpus root: a leftover
    # LibriSpeech/ tree from another subset must not suppress extraction.
    if not libri_dir.exists():
        # Fetch the archive only when it is actually needed for extraction.
        if not tar_path.exists():
            print(f"Downloading LibriSpeech {subset} dataset...")
            download_file(url, str(tar_path))

        print(f"\nExtracting LibriSpeech {subset} dataset...")
        with tarfile.open(tar_path, 'r:gz') as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would be written
                # outside target_dir (absolute paths, "..", unsafe links).
                tar.extractall(target_dir, filter="data")
            else:
                # Older Python without extraction filters.
                tar.extractall(target_dir)

    # Reorganise the file structure.
    if not libri_dir.is_dir():
        raise FileNotFoundError(
            f"Expected speaker directories under {libri_dir}, but it does not exist"
        )

    speaker_dirs = sorted(entry for entry in libri_dir.iterdir() if entry.is_dir())
    for src_dir in tqdm(speaker_dirs, desc="Organizing files"):
        dst_dir = target_dir / f"libri_{src_dir.name}"
        if not dst_dir.exists():
            # The source tree is deleted below anyway, so move instead of
            # copying every audio file a second time.
            shutil.move(str(src_dir), str(dst_dir))

    # Clean up the downloaded archive and the extracted tree.
    if tar_path.exists():
        tar_path.unlink()
    if corpus_root.exists():
        shutil.rmtree(corpus_root)
def download_aishell3(target_dir: str = "data/raw"):
    """Print manual download instructions for the AISHELL-3 dataset.

    Nothing is fetched by this function; per the original author's note the
    dataset requires an OpenSLR account, so the user is pointed to the
    download page and told where to extract the files.

    Args:
        target_dir: Directory the user is asked to extract the dataset into.
    """
    instructions = (
        "AISHELL-3 dataset needs to be downloaded manually from:",
        "https://www.openslr.org/93/",
        f"Please download and extract it to {target_dir}",
    )
    for line in instructions:
        print(line)
if __name__ == "__main__":
    # Command-line entry point: pick a dataset and fetch it into --target_dir.
    import argparse

    parser = argparse.ArgumentParser(description="Download speech datasets")
    parser.add_argument("--dataset", type=str, choices=["vctk", "librispeech", "aishell3"],
                        required=True, help="Dataset to download")
    parser.add_argument("--target_dir", type=str, default="data/raw",
                        help="Directory to save the dataset")
    # Exposes the subset parameter of download_librispeech, which was
    # previously unreachable from the command line. The default matches the
    # function's own default, so existing invocations behave the same.
    parser.add_argument("--subset", type=str, default="dev-clean",
                        help="LibriSpeech subset to download "
                             "(only used with --dataset librispeech)")
    args = parser.parse_args()

    if args.dataset == "vctk":
        download_vctk(args.target_dir)
    elif args.dataset == "librispeech":
        download_librispeech(args.target_dir, args.subset)
    else:
        # argparse restricts --dataset to the three choices, so this is aishell3.
        download_aishell3(args.target_dir)