File size: 1,110 Bytes
da6e1bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
from datetime import date
import pandas as pd
from joblib.memory import Memory
from langcodes import standardize_tag
from requests import get
# Memoization decorator backed by a joblib Memory store located at ".cache";
# verbose=0 silences joblib's logging output.
cache = Memory(
    location=".cache",
    verbose=0,
).cache
# load CommonVoice stats
@cache  # cache for 1 day
def get_commonvoice_stats(date: date):
    """Download the per-language statistics from the CommonVoice API.

    Args:
        date: Not used in the body. It only serves to vary the arguments the
            ``cache`` decorator sees, so that passing today's date yields at
            most one download per calendar day.

    Returns:
        The decoded JSON payload of the ``stats/languages`` endpoint.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    response = get(
        "https://commonvoice.mozilla.org/api/v1/stats/languages",
        # Without a timeout, requests may block forever on a stalled server.
        timeout=30,
    )
    # Fail loudly on HTTP errors so that an error payload is never returned
    # (and therefore never stored by the cache) as if it were real data.
    response.raise_for_status()
    return response.json()
# Build a table of CommonVoice validated hours: fetch today's stats, rename
# the API fields to project column names, and keep only those two columns.
commonvoice = pd.DataFrame(get_commonvoice_stats(date.today()))
commonvoice = commonvoice.rename(
    columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
)
commonvoice = commonvoice[["commonvoice_locale", "commonvoice_hours"]]

# Derive a BCP-47 tag per locale:
#   1. strip a trailing two-letter upper-case country suffix -- language is
#      language (in practice this is only relevant to zh-CN/zh-TW/zh-HK);
#   2. normalise the remainder with langcodes, asking for macrolanguages.
# NOTE: per the original author, macro=True does not really seem to resolve
# macrolanguages in every case (e.g. not for Quechua).
commonvoice["bcp_47"] = commonvoice["commonvoice_locale"].map(
    lambda locale: standardize_tag(re.sub(r"-[A-Z]{2}$", "", locale), macro=True)
)

# Collapse locales that share a tag: add up their hours and keep the first
# locale encountered as the representative one.
hours_by_tag = commonvoice.groupby("bcp_47").agg(
    commonvoice_hours=("commonvoice_hours", "sum"),
    commonvoice_locale=("commonvoice_locale", "first"),
)
commonvoice = hours_by_tag.reset_index()
del hours_by_tag  # keep the module namespace identical to before
|