# download_data.py
import requests
import tarfile
import zipfile
import io
import pandas as pd
from pathlib import Path
import sys
import huggingface_hub
from datasets import load_dataset
# Import fleurs DataFrame directly from its source module
from datasets_.fleurs import fleurs
# --- Configuration ---
# Add project root to sys.path (still useful for potential future imports if needed)
project_root = Path(__file__).resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
DATA_DIR = project_root / "data"
FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
FLORES_PLUS_HF_ID = "openlanguagedata/flores_plus"
FLORES_TARGET_DIR = DATA_DIR / "floresp-v2.0-rc.3" / "dev_parquet" # Note: Saving as parquet
GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
GLOTTOLOG_CSV_NAME = "languoid.csv"
SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html" # This is HTML, need manual download or parsing
SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm" # Assumed direct link
SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
SPBLEU_DICT_NAME = "dictionary.txt"
# --- Helper Functions ---
def download_file(url, path: Path):
    """Downloads a file from a URL to a local path."""
    print(f"Downloading {url} to {path}...")
    try:
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Successfully downloaded {path.name}.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
    except Exception as e:
        print(f"An error occurred while saving {path}: {e}")
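
# A minimal retry wrapper around download_file, assuming transient network
# failures are worth retrying. This is a sketch, not part of the original
# flow: the retry count and backoff values are illustrative choices, and
# success is approximated by checking that a non-empty file was written.
import time

def download_file_with_retries(url, path: Path, retries: int = 3, backoff: float = 2.0) -> bool:
    """Calls download_file up to `retries` times, backing off between attempts."""
    for attempt in range(1, retries + 1):
        download_file(url, path)
        if path.exists() and path.stat().st_size > 0:
            return True
        if attempt < retries:
            print(f"Retrying ({attempt}/{retries}) after {backoff:.0f}s...")
            time.sleep(backoff)
            backoff *= 2  # exponential backoff
    return False
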
def extract_tar_gz(tar_path: Path, extract_path: Path):
    """Extracts a .tar.gz file."""
    print(f"Extracting {tar_path} to {extract_path}...")
    try:
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extractall(path=extract_path)
        print(f"Successfully extracted {tar_path.name}.")
        # tar_path.unlink()  # Optionally remove the archive after extraction
    except tarfile.TarError as e:
        print(f"Error extracting {tar_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during extraction: {e}")
def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
    """Extracts a specific file from zip content in memory."""
    print(f"Extracting {target_filename} from zip data to {extract_path}...")
    try:
        with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
            # Find the correct file within the zip structure
            target_zip_path = None
            for member in z.namelist():
                if member.endswith(target_filename):
                    target_zip_path = member
                    break
            if target_zip_path:
                with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
                    target.write(source.read())
                print(f"Successfully extracted {target_filename}.")
            else:
                print(f"Error: Could not find {target_filename} within the zip archive.")
    except zipfile.BadZipFile:
        print("Error: Downloaded file is not a valid zip archive.")
    except Exception as e:
        print(f"An error occurred during zip extraction: {e}")
# --- Download Functions ---
def download_fleurs_data():
    """Downloads Fleurs audio and text data."""
    print("\n--- Downloading Fleurs Data ---")
    FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
    # Use the fleurs_tag column from the imported DataFrame
    fleurs_tags_list = fleurs['fleurs_tag'].tolist()
    if not fleurs_tags_list:
        print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
        return
    print(f"Checking/Downloading Fleurs for {len(fleurs_tags_list)} languages...")
    for lang_tag in fleurs_tags_list:
        lang_dir = FLEURS_TARGET_DIR / lang_tag
        audio_dir = lang_dir / "audio"
        dev_tsv_path = lang_dir / "dev.tsv"
        dev_audio_archive_path = audio_dir / "dev.tar.gz"
        audio_extracted_marker = audio_dir / "dev"  # Check if extraction likely happened
        # Download TSV
        if not dev_tsv_path.exists():
            tsv_url = f"{FLEURS_BASE_URL}/{lang_tag}/dev.tsv"
            download_file(tsv_url, dev_tsv_path)
        else:
            print(f"Found: {dev_tsv_path}")
        # Download and extract audio
        if not audio_extracted_marker.exists():
            if not dev_audio_archive_path.exists():
                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
                download_file(tar_url, dev_audio_archive_path)
            if dev_audio_archive_path.exists():
                extract_tar_gz(dev_audio_archive_path, audio_dir)
            else:
                print(f"Audio archive missing, cannot extract for {lang_tag}")
        else:
            print(f"Found extracted audio: {audio_extracted_marker}")
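
# A small sanity check for the FLEURS downloads: compares the number of
# extracted .wav files against the number of rows in dev.tsv. This is a
# sketch; it assumes the archive extracts into an audio/dev directory of
# .wav files (as the marker above suggests) and makes no assumption about
# the column layout of dev.tsv.
def check_fleurs_download(lang_tag: str) -> None:
    lang_dir = FLEURS_TARGET_DIR / lang_tag
    tsv_path = lang_dir / "dev.tsv"
    extracted_dir = lang_dir / "audio" / "dev"
    if not tsv_path.exists() or not extracted_dir.exists():
        print(f"{lang_tag}: incomplete download.")
        return
    with open(tsv_path, encoding="utf-8") as f:
        n_rows = sum(1 for _ in f)
    n_wavs = len(list(extracted_dir.glob("*.wav")))
    print(f"{lang_tag}: {n_rows} transcript rows, {n_wavs} audio files.")
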
def download_flores_plus_data():
    """Downloads Flores+ data using the Hugging Face datasets library."""
    print("\n--- Downloading Flores+ Data (requires HF login & accepted terms) ---")
    FLORES_TARGET_DIR.mkdir(parents=True, exist_ok=True)
    try:
        # Check login status first (HfFolder.get_token() is deprecated in
        # recent huggingface_hub releases, so use the top-level helper)
        token = huggingface_hub.get_token()
        if not token:
            print("Hugging Face token not found. Please log in using `huggingface-cli login`.")
            print("You also need to accept the terms for 'openlanguagedata/flores_plus' on the HF website.")
            return
        print(f"Attempting to download '{FLORES_PLUS_HF_ID}' (dev split)...")
        # Load only the 'dev' split
        ds = load_dataset(FLORES_PLUS_HF_ID, split='dev', verification_mode='no_checks')
        # For simplicity, save the whole dev split as one parquet file;
        # a per-language split is sketched after this function if needed later.
        target_file = FLORES_TARGET_DIR / "dev_split.parquet"
        print(f"Saving dev split to {target_file}...")
        ds.to_parquet(str(target_file))
        print("Flores+ dev split downloaded and saved as parquet.")
    except huggingface_hub.utils.GatedRepoError:
        print(f"Error: Access to '{FLORES_PLUS_HF_ID}' is gated.")
        print("Please ensure you are logged in (`huggingface-cli login`) and have accepted the terms")
        print(f"on the dataset page: https://huggingface.co/datasets/{FLORES_PLUS_HF_ID}")
    except Exception as e:
        print(f"An error occurred downloading or saving Flores+: {e}")
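
# The function above keeps the whole dev split in a single parquet file. If
# per-language files turn out to be more convenient, a sketch like the
# following could split it afterwards; it assumes the iso_639_3 and
# iso_15924 columns that convert_flores_parquet_to_text() also relies on.
def split_flores_parquet_per_language():
    """Splits dev_split.parquet into one parquet file per language/script."""
    parquet_file = FLORES_TARGET_DIR / "dev_split.parquet"
    if not parquet_file.exists():
        print(f"Parquet file not found: {parquet_file}. Skipping split.")
        return
    df = pd.read_parquet(parquet_file)
    for (lang, script), group in df.groupby(['iso_639_3', 'iso_15924']):
        out_path = FLORES_TARGET_DIR / f"dev.{lang}_{script}.parquet"
        group.to_parquet(out_path, index=False)
        print(f"Wrote {len(group)} rows to {out_path}")
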
def download_glottolog_data():
    """Downloads and extracts the Glottolog languoid CSV."""
    print("\n--- Downloading Glottolog Data ---")
    target_csv = GLOTTOLOG_TARGET_DIR / GLOTTOLOG_CSV_NAME
    if not target_csv.exists():
        print(f"Downloading Glottolog zip from {GLOTTOLOG_URL}...")
        try:
            response = requests.get(GLOTTOLOG_URL, timeout=60)
            response.raise_for_status()
            GLOTTOLOG_TARGET_DIR.mkdir(parents=True, exist_ok=True)
            extract_zip(response.content, GLOTTOLOG_TARGET_DIR, GLOTTOLOG_CSV_NAME)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading Glottolog zip: {e}")
        except Exception as e:
            print(f"An error occurred processing Glottolog: {e}")
    else:
        print(f"Found: {target_csv}")
def download_scriptcodes_data():
    """Prints instructions for obtaining the ScriptCodes CSV."""
    print("\n--- Downloading ScriptCodes Data ---")
    # The URL points to an HTML page, not a direct CSV link, so manual
    # download is likely required for ScriptCodes.csv. A possible automated
    # fallback is sketched below in download_scriptcodes_from_registry().
    print(f"Cannot automatically download from {SCRIPTCODES_URL}")
    print("Please manually download the ISO 15924 codes list (often available as a .txt file)")
    print("from the Unicode website or related sources and save it as:")
    print(f"{SCRIPTCODES_TARGET_FILE}")
    if SCRIPTCODES_TARGET_FILE.exists():
        print(f"Note: File already exists at {SCRIPTCODES_TARGET_FILE}")
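
# A possible automation of the manual step above: the ISO 15924 registry is
# also published as a semicolon-delimited text file. The URL and field layout
# assumed here (code;number;English name;French name;PVA;Unicode version;date,
# with '#' comment lines) describe that registry as commonly distributed and
# should be verified before relying on the output downstream.
def download_scriptcodes_from_registry():
    """Fetches the plain-text ISO 15924 registry and saves it as CSV."""
    registry_url = "https://www.unicode.org/iso15924/iso15924.txt"  # assumed location
    try:
        response = requests.get(registry_url, timeout=60)
        response.raise_for_status()
        rows = [
            line.split(";")
            for line in response.text.splitlines()
            if line.strip() and not line.startswith("#")
        ]
        df = pd.DataFrame(rows)
        df.to_csv(SCRIPTCODES_TARGET_FILE, index=False, header=False)
        print(f"Wrote {len(df)} script codes to {SCRIPTCODES_TARGET_FILE}")
    except requests.exceptions.RequestException as e:
        print(f"Could not fetch the ISO 15924 registry: {e}")
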
def download_spbleu_data():
    """Downloads the SPM model and dictionary for spbleu."""
    print("\n--- Downloading spbleu SPM Model and Dictionary ---")
    SPBLEU_TARGET_DIR.mkdir(parents=True, exist_ok=True)
    # Download SPM model
    target_model_file = SPBLEU_TARGET_DIR / SPBLEU_SPM_NAME
    if not target_model_file.exists():
        print("Downloading SPM model...")
        download_file(SPBLEU_SPM_URL, target_model_file)
    else:
        print(f"Found: {target_model_file}")
    # Download dictionary
    target_dict_file = SPBLEU_TARGET_DIR / SPBLEU_DICT_NAME
    if not target_dict_file.exists():
        print("Downloading dictionary...")
        download_file(SPBLEU_DICT_URL, target_dict_file)
    else:
        print(f"Found: {target_dict_file}")
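
# For reference, these files are typically consumed via sentencepiece when
# computing spBLEU. A minimal loading sketch (kept as a comment so this
# script does not require the `sentencepiece` package at import time):
#
#     import sentencepiece as spm
#     sp = spm.SentencePieceProcessor(model_file=str(SPBLEU_TARGET_DIR / SPBLEU_SPM_NAME))
#     pieces = sp.encode("Hello, world!", out_type=str)
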
# --- Conversion Function ---
def convert_flores_parquet_to_text():
    """Converts the downloaded Flores+ parquet dev split to text files."""
    print("\n--- Converting Flores+ Parquet to Text Files ---")
    parquet_file = FLORES_TARGET_DIR / "dev_split.parquet"
    text_dir = project_root / "data" / "floresp-v2.0-rc.3" / "dev"  # Original expected dir
    if not parquet_file.exists():
        print(f"Parquet file not found: {parquet_file}. Skipping conversion.")
        return
    try:
        print(f"Reading parquet file: {parquet_file}")
        df = pd.read_parquet(parquet_file)
        print(f"Read {len(df)} rows from parquet.")
        if not all(col in df.columns for col in ['iso_639_3', 'iso_15924', 'text']):
            print("Error: Parquet file missing required columns (iso_639_3, iso_15924, text).")
            return
        text_dir.mkdir(parents=True, exist_ok=True)
        print(f"Target directory for text files: {text_dir}")
        # Group by language and script to create individual files
        grouped = df.groupby(['iso_639_3', 'iso_15924'])
        count = 0
        for (lang, script), group in grouped:
            target_filename = f"dev.{lang}_{script}"
            target_path = text_dir / target_filename
            print(f"Writing {len(group)} sentences to {target_path}...")
            try:
                with open(target_path, 'w', encoding='utf-8') as f:
                    for sentence in group['text']:
                        f.write(sentence + '\n')
                count += 1
            except Exception as e:
                print(f"Error writing file {target_path}: {e}")
        print(f"Successfully wrote {count} language/script files to {text_dir}.")
    except ImportError:
        print("Error: pandas or pyarrow might be missing. Cannot read parquet.")
        print("Please install them: pip install pandas pyarrow")
    except Exception as e:
        print(f"An error occurred during parquet conversion: {e}")
# --- Main Execution ---
def main():
    """Runs all download functions and the conversion step."""
    print("Starting data download process...")
    DATA_DIR.mkdir(exist_ok=True)
    download_flores_plus_data()
    convert_flores_parquet_to_text()
    # download_fleurs_data()
    download_glottolog_data()
    download_scriptcodes_data()
    download_spbleu_data()
    print("\nData download process finished.")
    print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
    print("Note: Flores+ was downloaded as parquet rather than as the original text files;")
    print("'evals/datasets_/flores.py' handles reading this format correctly.")

if __name__ == "__main__":
    main()