File size: 5,814 Bytes
07b0c58
fdd5df0
fe8a99c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fdd5df0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
<!DOCTYPE html>
<html>
<head>
    <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
    <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <meta charset="utf8">
    <base target="_blank">
    <title>Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks</title>
    <link rel="stylesheet" href="style.css">
</head>

<body>
<d-front-matter>
    <script id='distill-front-matter' type="text/json">{
    "title": "📝 Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks",
    "description": "This blog covers a discussion on multilingual evaluation and task signal, the processes for selecting existing evaluation tasks based on signal resulting in FineTasks, and comparisson of open and closed sourced on the FineTasks.",
    "published": "Oct 23, 2024",
    "affiliation": {"name": "HuggingFace"},
    "authors": [
      {
        "author":"Hynek Kydlíček",
        "authorURL":"https://huggingface.co/hynky"
      },
      {
        "author":"Guilherme Penedo",
        "authorURL":"https://huggingface.co/guipenedo"
      },
      {
        "author":"Clémentine Fourier",
        "authorURL":"https://huggingface.co/clefourrier"
      },
      {
        "author":"Nathan Habib",
        "authorURL":"https://huggingface.co/SaylorTwift"
      },
      {
        "author":"Thomas Wolf",
        "authorURL":"https://huggingface.co/thomwolf"
      }
    ]
    }</script>
</d-front-matter>

<d-byline></d-byline>
<d-article>
    <d-contents>
    </d-contents>
    <p>We're looking forward to revisiting this analysis in the future, not with just 9 languages, but at least 50—thanks to community contributions! Let's level the playing field between English and other languages together! 🤗</p>
    <d-math> 1+1=2 </d-math>
</d-article>

<d-appendix>
    <d-bibliography src="bibliography.bib"></d-bibliography>
    <style>
      d-appendix .citation {
        font-size: 11px;
        line-height: 15px;
        border-left: 1px solid rgba(0, 0, 0, 0.1);
        padding-left: 18px;
        border: 1px solid rgba(0,0,0,0.1);
        background: rgba(0, 0, 0, 0.02);
        padding: 10px 18px;
        border-radius: 3px;
        color: rgba(150, 150, 150, 1);
        overflow: hidden;
        margin-top: -12px;
        white-space: pre-wrap;
        word-wrap: break-word;
      }
    </style>

    <h3 id="citation">Citation</h3>
    <p>For attribution in academic contexts, please cite this work as</p>
    <pre class="citation short">Kydlicek, et al., "FineTasks: Finding signal in a haystack of 200+ multilingual tasks", 2024.</pre>
    <p>BibTeX citation</p>
    <pre class="citation long">@misc{kydlicek2024finetasksmultilingualtasks,
      title={FineTasks: Finding signal in a haystack of 200+ multilingual tasks},
      author={Hynek Kydlíček and Guilherme Penedo and Clémentine Fourier and Nathan Habib and Thomas Wolf},
      url={https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks},
}</pre>
</d-appendix>

<script>
    const article = document.querySelector('d-article');
    const toc = document.querySelector('d-contents');
    if (toc) {
        const headings = article.querySelectorAll('h2, h3, h4');
        let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
        let prevLevel = 0;

        for (const el of headings) {
            // should element be included in TOC?
            const isInTitle = el.parentElement.tagName == 'D-TITLE';
            const isException = el.getAttribute('no-toc');
            if (isInTitle || isException) continue;
            el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
            const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';

            const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
            while (prevLevel < level) {
                ToC += '<ul>'
                prevLevel++;
            }
            while (prevLevel > level) {
                ToC += '</ul>'
                prevLevel--;
            }
            if (level === 0)
                ToC += '<div>' + link + '</div>';
            else
                ToC += '<li>' + link + '</li>';
        }

        while (prevLevel > 0) {
            ToC += '</ul>'
            prevLevel--;
        }
        ToC += '</nav>';
        toc.innerHTML = ToC;
        toc.setAttribute('prerendered', 'true');
        const toc_links = document.querySelectorAll('d-contents > nav a');

        window.addEventListener('scroll', (_event) => {
            if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
                // Then iterate forwards, on the first match highlight it and break
                find_active: {
                    for (let i = headings.length - 1; i >= 0; i--) {
                        if (headings[i].getBoundingClientRect().top - 50 <= 0) {
                            if (!toc_links[i].classList.contains("active")) {
                                toc_links.forEach((link, _index) => {
                                    link.classList.remove("active");
                                });
                                toc_links[i].classList.add('active');
                            }
                            break find_active;
                        }
                    }
                    toc_links.forEach((link, _index) => {
                        link.classList.remove("active");
                    });
                }
            }
        });
    }
</script>
</body>
</html>