File size: 1,418 Bytes
723f963
 
566c57e
2c21cf7
723f963
 
566c57e
723f963
 
 
 
 
 
 
 
 
 
566c57e
723f963
 
 
92b2164
723f963
 
 
 
92b2164
2c21cf7
723f963
 
 
92b2164
 
 
 
 
723f963
 
 
 
2c21cf7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import re
from collections import defaultdict
from joblib.memory import Memory
import pandas as pd
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION

# Memoization decorator backed by joblib: results are stored under the
# relative ".cache" directory, and verbose=0 silences joblib's logging.
cache = Memory(location=".cache", verbose=0).cache


def population(bcp_47):
    """Return the per-region population figures recorded for a language tag.

    Scans ``LANGUAGE_SPEAKING_POPULATION`` for keys of the exact form
    ``"<bcp_47>-<REGION>"``, where ``<REGION>`` is two uppercase ASCII
    letters, and maps each region code to the value stored under that key.

    Args:
        bcp_47: Language tag to look up, e.g. ``"en"`` or ``"zh-Hans"``.

    Returns:
        A dict mapping the two-letter region code to the stored population
        value. Empty when no key matches.
    """
    # Escape the tag so it is matched literally, and capture the region
    # directly. Stripping only the first lowercase subtag would leave a
    # script subtag in the key ("zh-Hans-CN" -> "Hans-CN" instead of "CN").
    region_pattern = re.compile(rf"{re.escape(str(bcp_47))}-([A-Z]{{2}})")
    regions = {}
    for tag, speakers in LANGUAGE_SPEAKING_POPULATION.items():
        found = region_pattern.fullmatch(tag)
        if found:
            regions[found.group(1)] = speakers
    return regions

@cache
def make_country_table(language_table):
    """Aggregate per-language scores into one row per country.

    For every language row, ``population(bcp_47)`` supplies the countries
    in which it is recorded together with a population figure. Each
    country's score is the average of its languages' scores weighted by
    those population figures.

    Args:
        language_table: DataFrame whose rows, as yielded by
            ``itertuples()``, expose ``bcp_47``, ``language_name`` and
            ``average`` attributes.

    Returns:
        A DataFrame with columns ``iso2`` (the country key returned by
        ``population``), ``score`` (population-weighted average) and
        ``languages`` (list of dicts with ``name``, ``bcp_47``,
        ``population`` and ``score``). The columns are present even when
        the result has no rows.
    """
    languages_by_country = defaultdict(list)
    for lang in language_table.itertuples():
        # A missing average is counted as 0 rather than dropped, so the
        # language still contributes its population to the weighting.
        lang_score = 0 if pd.isna(lang.average) else lang.average
        for country, speaker_pop in population(lang.bcp_47).items():
            languages_by_country[country].append(
                {
                    "name": lang.language_name,
                    "bcp_47": lang.bcp_47,
                    "population": speaker_pop,
                    "score": lang_score,
                }
            )

    rows = []
    for country, languages in languages_by_country.items():
        total_pop = sum(entry["population"] for entry in languages)
        weighted_sum = sum(
            entry["score"] * entry["population"] for entry in languages
        )
        # Guard the division: a country whose recorded populations sum to
        # zero gets a score of 0 instead of raising ZeroDivisionError.
        score = weighted_sum / total_pop if total_pop else 0
        rows.append({"iso2": country, "score": score, "languages": languages})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["iso2", "score", "languages"])