import traceback

import numpy as np
import pandas as pd
from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
from huggingface_hub.utils import HfHubHTTPError
from pandas import DataFrame

from src.display.utils import AutoEvalColumn, ModelType, NUMERIC_INTERVALS
from src.envs import H4_TOKEN, PATH_TO_COLLECTION

# Specific intervals for the collections
"""
intervals = {
    "1B": pd.Interval(0, 1.5, closed="right"),
    "3B": pd.Interval(2.5, 3.5, closed="neither"),
    "7B": pd.Interval(6, 8, closed="neither"),
    "13B": pd.Interval(10, 14, closed="neither"),
    "30B": pd.Interval(25, 35, closed="neither"),
    "65B": pd.Interval(60, 70, closed="neither"),
}
"""
intervals = {k:v for k,v in NUMERIC_INTERVALS.items() if "?" not in k}

def update_collections(df: DataFrame):
    """This function updates the Open LLM Leaderboard model collection with the latest best models for
    each size category and type.
    """
    print("Updating collections...")
    collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
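    # Coerce the params column to floats; non-numeric values become NaN and match no size interval.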
    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")

    cur_best_models = []
    cur_best_scores = []
    cur_itens = []
    scores_per_type = {'pretrained': 0, 'other': 0, 'language': 0}

    types_to_consider = [('pretrained', [ModelType.PT]), ('other', [ModelType.LA, ModelType.FT, ModelType.chat])]

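    # Start from a clean slate: delete every existing item before re-adding the new picks.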
    for item in collection.items:
        try:
            delete_collection_item(
                collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
            )
        except HfHubHTTPError:
            traceback.print_exc()
            continue

    # Filter out quantized models (currently disabled):
    # df = df[df[AutoEvalColumn.precision.name].isin(['bfloat16', 'float16', "?"])]

    ix = 0
    
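    # Walk each size bucket and pick the best model for every type group.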
    for size in intervals:
        interval_scores = []
        interval_itens_languages = []
        interval_itens = []
        
        numeric_interval = pd.IntervalIndex([intervals[size]])
        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
        size_df = df.loc[mask]

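        # Map each ModelType group to its emoji symbols, then keep only rows of those types.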
        for model_type, types in types_to_consider:
            type_emojis = []
            for type in types:
                if type.value.name == "":
                    continue
                type_emoji = [t[0] for t in type.value.symbol]
                type_emojis.extend(type_emoji)
            filtered_df = size_df[size_df[AutoEvalColumn.model_type_symbol.name].isin(type_emojis)]
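            # Only keep models that beat the best score already recorded for this type,
            # so each successive (larger) size bucket adds a strictly better model.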
            filtered_df = filtered_df[filtered_df[AutoEvalColumn.average.name].astype(float) > scores_per_type[model_type]]

            best_models = filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)
            print(type_emojis, size, list(best_models[AutoEvalColumn.dummy.name])[:10])
            # We add them one by one to the leaderboard
            for i, row in best_models.iterrows():
                model = row[AutoEvalColumn.dummy.name]
                hf_path = row['hf_path']
                hf_path = hf_path if 'meta-llama/Meta-' not in hf_path else hf_path.replace("meta-llama/Meta-", "meta-llama/")
                if hf_path in cur_best_models:
                    continue
                score = row[AutoEvalColumn.average.name]
                language = row[AutoEvalColumn.main_language.name]
                if language == 'Portuguese':
                    note = f"Best Portuguese {type.to_str(' ')} model of around {size} on the leaderboard today! (Score: {score})"
                else:
                    note = f"Best {type.to_str(' ')} model of around {size} on the leaderboard today! (Score: {score})"
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
                        item_id=hf_path,
                        item_type="model",
                        exists_ok=True,
                        note=note,
                        token=H4_TOKEN,
                    )
                    ix += 1
                    item_object_id = collection.items[-1].item_object_id
                    cur_best_models.append(hf_path)
                    cur_best_scores.append(float(score))
                    interval_scores.append(float(score))
                    interval_itens_languages.append(language)
                    cur_itens.append(item_object_id)
                    interval_itens.append(item_object_id)
                    scores_per_type[model_type] = float(score)
                    break
                except HfHubHTTPError:
                    traceback.print_exc()
                    continue
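        # Language fallback: if no Portuguese model made this size bucket,
        # add the best Portuguese model as a dedicated 'language' entry.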
        if 'Portuguese' not in interval_itens_languages:
            language = ['Portuguese']
            model_type = 'language'
            filtered_df = size_df[size_df[AutoEvalColumn.main_language.name].isin(language)]
            filtered_df = filtered_df[filtered_df[AutoEvalColumn.average.name].astype(float) > scores_per_type[model_type]]

            best_models = filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)
            print(language, size, list(best_models[AutoEvalColumn.dummy.name])[:10])
            # We add them one by one to the leaderboard
            for i, row in best_models.iterrows():
                model = row[AutoEvalColumn.dummy.name]
                hf_path = row['hf_path']
                hf_path = hf_path if 'meta-llama/Meta-' not in hf_path else hf_path.replace("meta-llama/Meta-", "meta-llama/")
                if hf_path in cur_best_models:
                    continue
                score = row[AutoEvalColumn.average.name]
                language = row[AutoEvalColumn.main_language.name]

                # Rows here are pre-filtered to Portuguese, so this note always applies.
                note = f"Best Portuguese {type.to_str(' ')} model of around {size} on the leaderboard today! (Score: {score})"
                try:
                    collection = add_collection_item(
                        PATH_TO_COLLECTION,
                        item_id=hf_path,
                        item_type="model",
                        exists_ok=True,
                        note=note,
                        token=H4_TOKEN,
                    )
                    ix += 1
                    item_object_id = collection.items[-1].item_object_id
                    cur_best_models.append(hf_path)
                    cur_best_scores.append(float(score))
                    interval_scores.append(float(score))
                    interval_itens_languages.append(language)
                    cur_itens.append(item_object_id)
                    interval_itens.append(item_object_id)
                    scores_per_type[model_type] = float(score)
                    break
                except HfHubHTTPError:
                    traceback.print_exc()
                    continue
    # Fix the display order: reposition the collection items by ascending score.
    starting_idx = len(cur_best_models)
    for k, i in enumerate(np.argsort(cur_best_scores)):
        if i == k:
            continue  # Item is already in the right position.
        try:
            update_collection_item(
                collection_slug=PATH_TO_COLLECTION,
                item_object_id=cur_itens[i],
                position=starting_idx + k,
                token=H4_TOKEN,
            )
        except HfHubHTTPError:
            traceback.print_exc()

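    # Final sweep: drop any leftover collection items that are not among the current picks.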
    collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
    for item in collection.items:
        if item.item_id not in cur_best_models:
            try:
                delete_collection_item(
                    collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
                )
            except HfHubHTTPError:
                traceback.print_exc()
                continue
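
# Example usage (sketch): assumes the caller has already built a results DataFrame
# with the leaderboard's columns; the CSV path below is hypothetical.
# if __name__ == "__main__":
#     results_df = pd.read_csv("leaderboard_results.csv")
#     update_collections(results_df)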