File size: 2,234 Bytes
da6e1bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
from langcodes import standardize_tag
from pathlib import Path
import tarfile
import requests

# Comma-separated FLEURS configuration names; each one is used verbatim as a
# directory name in the dataset URLs built further down in this file.
fleurs_tags = "af_za,am_et,ar_eg,as_in,ast_es,az_az,be_by,bg_bg,bn_in,bs_ba,ca_es,ceb_ph,ckb_iq,cmn_hans_cn,cs_cz,cy_gb,da_dk,de_de,el_gr,en_us,es_419,et_ee,fa_ir,ff_sn,fi_fi,fil_ph,fr_fr,ga_ie,gl_es,gu_in,ha_ng,he_il,hi_in,hr_hr,hu_hu,hy_am,id_id,ig_ng,is_is,it_it,ja_jp,jv_id,ka_ge,kam_ke,kea_cv,kk_kz,km_kh,kn_in,ko_kr,ky_kg,lb_lu,lg_ug,ln_cd,lo_la,lt_lt,luo_ke,lv_lv,mi_nz,mk_mk,ml_in,mn_mn,mr_in,ms_my,mt_mt,my_mm,nb_no,ne_np,nl_nl,nso_za,ny_mw,oc_fr,om_et,or_in,pa_in,pl_pl,ps_af,pt_br,ro_ro,ru_ru,sd_in,sk_sk,sl_si,sn_zw,so_so,sr_rs,sv_se,sw_ke,ta_in,te_in,tg_tj,th_th,tr_tr,uk_ua,umb_ao,ur_pk,uz_uz,vi_vn,wo_sn,xh_za,yo_ng,yue_hant_hk,zu_za"

# One row per FLEURS configuration: the raw tag plus a standardized language
# tag derived from it.
fleurs = pd.DataFrame({"fleurs_tag": fleurs_tags.split(",")})
# Only the part before the first underscore (the language subtag) is kept; any
# script/region suffix is dropped before standardization.
# NOTE(review): macro=True presumably folds codes such as "cmn" into their
# macrolanguage -- confirm against the langcodes documentation.
fleurs["bcp_47"] = fleurs["fleurs_tag"].map(
    lambda tag: standardize_tag(tag.partition("_")[0], macro=True)
)

def download_file(url, path, timeout=60, chunk_size=1 << 20):
    """Download ``url`` and store the response body at ``path``.

    The body is streamed to a sibling ``<name>.part`` file and renamed to
    ``path`` only after the whole transfer succeeded, so ``path`` never holds
    a truncated download or an HTTP error page.

    Args:
        url: Address to fetch with an HTTP GET request.
        path: Destination file (``str`` or ``Path``); its parent directory
            must already exist.
        timeout: Seconds passed to ``requests.get`` as connect/read timeout.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    path = Path(path)
    partial = path.with_name(path.name + ".part")
    try:
        # stream=True avoids holding a potentially large archive in memory.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an error page as if it were data.
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        partial.replace(path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial.unlink(missing_ok=True)


def download_fleurs(transcription_langs_eval):
    """Fetch the FLEURS dev split (audio archive and transcripts) per language.

    For every row of ``transcription_langs_eval`` the dev audio archive is
    downloaded into and unpacked under ``data/fleurs/<fleurs_tag>/audio``, and
    the transcript table is saved as ``data/fleurs/<fleurs_tag>/dev.tsv``.
    Data that is already on disk is skipped, so the function can be re-run.

    Args:
        transcription_langs_eval: Object with an ``itertuples()`` method whose
            rows expose a ``fleurs_tag`` attribute, e.g. a pandas DataFrame
            with a ``fleurs_tag`` column.

    Raises:
        Exception: Any error from ``download_file`` or from opening/unpacking
            the archive is propagated. The partially populated audio directory
            is removed first so that the next run retries that language.
    """
    import shutil  # function-scope import: only needed for failure cleanup

    # the huggingface loader does not allow loading only the dev set, so do it manually
    for language in transcription_langs_eval.itertuples():
        tar_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/audio/dev.tar.gz"
        tar_path = Path(f"data/fleurs/{language.fleurs_tag}/audio/dev.tar.gz")
        audio_path = Path(f"data/fleurs/{language.fleurs_tag}/audio")
        if not audio_path.exists():
            print(f"Downloading {tar_url} to {tar_path}")
            try:
                tar_path.parent.mkdir(parents=True, exist_ok=True)
                download_file(tar_url, tar_path)
                with tarfile.open(tar_path, "r:gz") as tar:
                    if hasattr(tarfile, "data_filter"):
                        # Refuse members that would escape audio_path; the
                        # archive comes from the network.
                        tar.extractall(path=audio_path, filter="data")
                    else:
                        # Older Python without extraction filters.
                        tar.extractall(path=audio_path)
            except BaseException:
                # The existence of audio_path is the "already downloaded"
                # marker, and it did not exist before this attempt. Remove it
                # so a failed or interrupted attempt is not skipped forever.
                shutil.rmtree(audio_path, ignore_errors=True)
                raise
        tsv_url = f"https://huggingface.co/datasets/google/fleurs/resolve/main/data/{language.fleurs_tag}/dev.tsv"
        tsv_path = Path(f"data/fleurs/{language.fleurs_tag}/dev.tsv")
        if not tsv_path.exists():
            print(f"Downloading {tsv_url} to {tsv_path}")
            tsv_path.parent.mkdir(parents=True, exist_ok=True)
            download_file(tsv_url, tsv_path)