José Ángel González committed on
Commit
1cf89c1
·
1 Parent(s): c1cfe75

From internal repo to HF

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Ivace Leaderboard
3
  emoji: 👀
4
  colorFrom: purple
5
  colorTo: gray
@@ -8,7 +8,7 @@ sdk_version: 1.41.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: IVACE leaderboard
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: IberBench Leaderboard
3
  emoji: 👀
4
  colorFrom: purple
5
  colorTo: gray
 
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: IberBench leaderboard
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,39 +2,76 @@ import json
2
  import os
3
  import re
4
  import uuid
 
5
  from pathlib import Path
6
 
7
  import pandas as pd
8
  import streamlit as st
 
 
9
  from datasets import load_dataset
10
- from huggingface_hub import CommitScheduler
 
 
11
 
12
  from src.check_validity import validate_model
 
13
 
14
- # define page config
15
- st.set_page_config(page_title="IberBench Leaderboard", layout="wide")
 
 
16
 
17
- # setup scheduler to upload user requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
19
  request_folder = request_file.parent
20
 
 
21
 
22
- # columns = [
23
- # "eval_name",
24
- # "Model",
25
- # "Type",
26
- # "Average ⬆️",
27
- # "IFEval",
28
- # "MMLU-PRO",
29
- # "GPQA",
30
- # "MUSR",
31
- # "CO₂ cost (kg)",
32
- # ]
33
- # languages
34
- lang_list = ["Spain", "Portuguese", "English", "Spanish", "Costa Rica", "Mexico", "Peru", "Uruguay", "Basque", "Catalan", "Galician"]
35
-
36
- # column order
37
- model_columns = ["model_name", "url", "type"]
38
 
39
  scheduler = CommitScheduler(
40
  repo_id="iberbench/user-requests",
@@ -46,247 +83,607 @@ scheduler = CommitScheduler(
46
  every=10,
47
  )
48
 
49
-
50
  def log_submission(input_dict: dict) -> None:
51
- """
52
- Append input/outputs and user feedback to a JSON Lines file using a thread lock to avoid concurrent writes from different users.
53
- """
54
  with scheduler.lock:
55
  with request_file.open("a") as f:
56
  f.write(json.dumps(input_dict))
57
  f.write("\n")
58
 
59
-
60
- # def get_url(html_content: str) -> str:
61
- # match = re.search(r'href=["\'](https?://[^\s"\']+)', html_content)
62
- # if match:
63
- # url = match.group(1)
64
- # return url
65
- # else:
66
- # raise ValueError("Url not found in the link")
67
-
68
-
69
  def get_lang_columns(columns: list, lang: str):
70
- """Filter columns per language"""
71
  lang_norm = lang.lower().replace(" ", "_")
72
-
73
  return [col for col in columns if lang_norm in col]
74
 
75
-
76
  @st.cache_data
77
  def load_data(lang) -> pd.DataFrame:
78
  try:
79
- data = (
80
- load_dataset("iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"])["train"]
81
- .to_pandas()
82
- )
83
- # filter lang columns
84
  task_columns = [col for col in data.columns if col not in model_columns]
85
  task_lang_columns = get_lang_columns(task_columns, lang)
 
86
  data = data[model_columns + task_lang_columns]
87
-
88
- # data["Model"] = data["Model"].apply(get_url)
89
- # data.sort_values(by="Average ⬆️", ascending=False, inplace=True)
90
- # data.reset_index(drop=True, inplace=True)
91
-
92
- # add column to apply filtering
93
- data["Active"] = False
94
-
95
  return data
96
  except FileNotFoundError:
97
- st.error("iberbench/lm-eval-results was not found in the hub")
98
  return pd.DataFrame()
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # functions to create filter
102
- def active_data(lang) -> pd.DataFrame:
103
- """Change all records as active"""
104
- return st.session_state[f"leaderboard_data_{lang}"][
105
- st.session_state[f"leaderboard_data_{lang}"]["Active"] == True
106
- ].copy()
107
 
 
 
108
 
109
  def get_index(lang, row) -> pd.Series:
110
- """Get index of the row"""
111
  return active_data(lang).iloc[row].name
112
 
113
-
114
  def commit(lang) -> None:
115
- """Commit changes to the session state"""
116
  for row in st.session_state[f"edited_data_{lang}"]["edited_rows"]:
117
  row_index = get_index(lang, row)
118
- for key, value in st.session_state[f"edited_data_{lang}"][
119
- "edited_rows"
120
- ][row].items():
121
- st.session_state[f"leaderboard_data_{lang}"].at[
122
- row_index, key
123
- ] = value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
- def create_search_per_language(lang: str, search_dict: dict):
127
- if not st.session_state[f"leaderboard_data_{lang}"].empty:
128
- search_dict[lang] = st.text_input(
129
- "Search for ...",
130
- key=f"search_input_{lang}",
131
- on_change=commit,
132
- kwargs={"lang": lang},
133
- )
134
- if search_dict[lang] == "":
135
- st.session_state[f"leaderboard_data_{lang}"].Active = True
 
 
136
  else:
137
- st.session_state[f"leaderboard_data_{lang}"].Active = False
138
- st.session_state[f"leaderboard_data_{lang}"].loc[
139
- st.session_state[f"leaderboard_data_{lang}"][
140
- "model_name"
141
- ].str.contains(search_dict[lang], case=False),
142
- "Active",
143
- ] = True
144
-
145
- # select columns to display
146
- task_columns = [col for col in st.session_state[f"leaderboard_data_{lang}"].columns if col not in model_columns]
147
- task_lang_columns = get_lang_columns(task_columns, lang)
148
- columns = model_columns + task_lang_columns
149
-
150
- edited_data = st.data_editor(
151
- active_data(lang),
152
- column_order=columns,
153
- key=f"edited_data_{lang}",
154
- hide_index=False,
155
- # column_config={"Model": st.column_config.LinkColumn("Model")},
156
- column_config={"url": st.column_config.LinkColumn("url")},
157
- )
158
- else:
159
- st.write("No data found to display on leaderboard.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
 
162
- # streamlit UI
163
- for lang in lang_list:
164
- # todo: load a different dataset per language of load different column per lang
165
- leaderboard_data = load_data(lang)
166
- if f"leaderboard_data_{lang}" not in st.session_state:
167
- st.session_state[f"leaderboard_data_{lang}"] = leaderboard_data
 
 
168
 
169
- tabs = st.tabs(["Leaderboard", "Submit model"])
170
- search_dict = {}
171
 
172
- with tabs[0]:
173
- # logo image
174
- cols_logo = st.columns(5, vertical_alignment="center")
175
- with cols_logo[2]:
176
- st.image("assets/images/hf-logo.png", use_container_width=True)
 
177
 
178
- # title
179
- st.markdown(
180
- """
181
- <div style="text-align: center;">
182
- <h1>IberBench LLM Leaderboard</h1>
183
- <p style="font-size: 1.2rem;">
184
- Comparing Large Language Models in an <span style="font-weight: 600;">open</span>
185
- and <span style="font-weight: 600;">reproducible</span> way
186
- </p>
187
- </div>
188
- """,
189
- unsafe_allow_html=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- # create tabs per language
193
- lang_tabs = st.tabs(lang_list)
 
 
 
 
 
 
 
 
 
194
 
195
- for lang, lt in zip(lang_list, lang_tabs):
196
- with lt:
197
- create_search_per_language(lang, search_dict)
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- with tabs[1]:
201
- st.header("Submit model")
202
 
203
- def get_id_number(id_val):
204
- html_template = f"""
205
- <div style="display: flex; align-items: flex-start; margin-bottom: 1rem;">
206
- <div style="
207
- width: 32px;
208
- height: 32px;
209
- border-radius: 50%;
210
- display: flex;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  align-items: center;
212
- justify-content: center;
213
- border: 1px solid #007BFF;
214
- color: #007BFF;
215
- font-size: 0.875rem;
216
- font-weight: 600;
217
- background-color: transparent;">
218
- {id_val}
219
- </div>"""
220
- return html_template
221
-
222
- # create guide info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  guide_info_list = []
224
  html_path = "assets/html"
225
- for filename in os.listdir(html_path):
 
226
  file_path = os.path.join(html_path, filename)
227
  with open(file_path, "r", encoding="utf-8") as file:
228
- guide_info_list.append(file.read())
229
-
230
- # display adding number id
231
- for i, info_div in enumerate(guide_info_list):
232
- st.markdown(get_id_number(i + 1) + info_div, unsafe_allow_html=True)
233
-
234
- with st.form("submit_model_form"):
 
 
 
 
 
 
 
 
 
 
235
  model_name = st.text_input(
236
- "Model Name (format: user_name/model_name)",
237
  help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
238
  )
239
  description = st.text_area(
240
- "Description",
241
- help="Add a description of the proposed model for the evaluation to help prioritize its evaluation",
242
- )
243
- user_contact = st.text_input(
244
- "Your Contact Email",
245
- help="User e-mail to contact when there are updates",
246
  )
 
247
  precision_option = st.selectbox(
248
- "Choose precision format:",
249
- help="Size limits vary by precision: • FP16/BF16: up to 100B parameters • 8-bit: up to 280B parameters (2x) • 4-bit: up to 560B parameters (4x) Choose carefully as incorrect precision can cause evaluation errors.",
250
  options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
251
  index=0,
252
  )
253
  weight_type_option = st.selectbox(
254
- "Select what type of weights are being loaded from the checkpoint provided:",
255
- help="Original: Complete model weights in safetensors format Delta: Weight differences from base model (requires base model for size calculation) Adapter: Lightweight fine-tuning layers (requires base model for size calculation)",
256
  options=["Original", "Adapter", "Delta"],
257
  index=0,
258
  )
259
- base_model_name = st.text_input(
260
- "Base model",
261
- help="Required for delta weights or adapters. This information is used to identify the original model and calculate the total parameter count by combining base model and adapter/delta parameters.",
262
- value="",
263
- )
264
  model_type = st.selectbox(
265
- "Choose model type:",
266
- help="🟢 Pretrained: Base models trained on text using masked modeling 🔶 Fine-tuned: Domain-specific optimization 💬 Chat: Models using RLHF, DPO, or IFT for conversation 🤝 Merge: Combined weights without additional training",
267
- options=[
268
- "🟢 Pretrained",
269
- "🔶 Fine-tuned",
270
- "💬 Chat",
271
- "🤝 Merge",
272
- ],
273
  )
274
- submit_button = st.form_submit_button("Submit Request")
275
-
276
  if submit_button:
277
- # validate model size, license, chat_templates
278
  use_chat_template = True if model_type == "💬 Chat" else False
279
  validation_error = validate_model(
280
- model_name,
281
- precision_option,
282
- base_model_name,
283
- weight_type_option,
284
- use_chat_template,
285
  )
286
  if validation_error is not None:
287
  st.error(validation_error)
288
  elif not re.match(r"[^@]+@[^@]+\.[^@]+", user_contact):
289
- st.error("Invalid email address.")
290
  else:
291
  input_dict = {
292
  "model_name": model_name,
@@ -299,8 +696,123 @@ with tabs[1]:
299
  }
300
  try:
301
  log_submission(input_dict)
302
- st.success("Your request has been sent successfully.")
303
  except Exception as e:
304
- st.error(
305
- f"Failed to send your request: {e}. Please try again later."
306
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import re
4
  import uuid
5
+ import random
6
  from pathlib import Path
7
 
8
  import pandas as pd
9
  import streamlit as st
10
+ import plotly.express as px
11
+
12
  from datasets import load_dataset
13
+ from huggingface_hub import CommitScheduler, hf_hub_download
14
+ from huggingface_hub.utils import RepositoryNotFoundError
15
+ from yaml import safe_load as yaml_load
16
 
17
  from src.check_validity import validate_model
18
+ from src.task_mappings import professional_mapping, semantic_categories
19
 
20
+ # -----------------------------------------------------------------------------
21
+ # Page configuration and global CSS styles for modern look and improved UX
22
+ # -----------------------------------------------------------------------------
23
+ st.set_page_config(page_title="IberBench", layout="wide", initial_sidebar_state="expanded", page_icon="🌍")
24
 
25
+ st.markdown(
26
+ """
27
+ <style>
28
+ /* General page styling */
29
+ body {
30
+ background-color: #f7f7f7;
31
+ font-family: 'Segoe UI', sans-serif;
32
+ }
33
+ /* Sidebar styling */
34
+ .css-1d391kg {
35
+ background-color: #ffffff;
36
+ border-right: 2px solid #eaeaea;
37
+ }
38
+ /* Header styling */
39
+ .main-header {
40
+ text-align: center;
41
+ padding: 2rem 0;
42
+ background: linear-gradient(90deg, #007BFF, #00BFFF);
43
+ color: white;
44
+ border-radius: 10px 10px 10px 10px;
45
+ }
46
+ /* Tab styling */
47
+ .stTabs > .css-1qimj2v {
48
+ background: #fff;
49
+ }
50
+ /* Form styling */
51
+ .stButton>button {
52
+ background-color: #007BFF;
53
+ color: white;
54
+ border: none;
55
+ border-radius: 5px;
56
+ }
57
+ </style>
58
+ """,
59
+ unsafe_allow_html=True,
60
+ )
61
+
62
+ # -----------------------------------------------------------------------------
63
+ # Global variables and helper functions
64
+ # -----------------------------------------------------------------------------
65
  request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
66
  request_folder = request_file.parent
67
 
68
+ LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")
69
 
70
+ dataset_columns = [
71
+ "workshop", "shared_task", "year", "task_type", "language",
72
+ "url", "language_variety", "problem_type", "num_labels", "labels",
73
+ ]
74
+ model_columns = ["model_name", "model_type", "num_parameters"]
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  scheduler = CommitScheduler(
77
  repo_id="iberbench/user-requests",
 
83
  every=10,
84
  )
85
 
 
def log_submission(input_dict: dict) -> None:
    """Append one submission to the request file as a single JSON line.

    The file is opened in append mode while ``scheduler.lock`` is held, so
    writes coming from different users do not interleave.

    Args:
        input_dict: JSON-serialisable description of the submitted model.
    """
    with scheduler.lock, request_file.open("a") as handle:
        handle.write(json.dumps(input_dict))
        handle.write("\n")
91
 
 
 
 
 
 
 
 
 
 
 
def get_lang_columns(columns: list, lang: str) -> list:
    """Return the columns whose name mentions the given language.

    The language name is normalised (lower-cased, spaces replaced by
    underscores) and matched as a substring of each column name.

    Args:
        columns: Candidate column labels.
        lang: Human-readable language name, e.g. ``"Costa Rica"``.

    Returns:
        The matching labels, in their original order. Labels that are not
        strings can never match and are skipped instead of raising.
    """
    lang_norm = lang.lower().replace(" ", "_")
    return [
        col for col in columns
        if isinstance(col, str) and lang_norm in col
    ]
95
 
 
@st.cache_data
def load_data(lang) -> pd.DataFrame:
    """Load the leaderboard results restricted to one language.

    Reads the ``train`` split of ``iberbench/lm-eval-results``, keeps the
    model metadata columns plus the task columns selected by
    ``get_lang_columns`` for ``lang``, and multiplies those task scores by 100.

    Args:
        lang: Language name used to select the task columns.

    Returns:
        The filtered frame, or an empty DataFrame (after showing an error in
        the UI) when loading raises ``FileNotFoundError``.
    """
    # Keep the try body minimal: only the download can raise FileNotFoundError.
    try:
        data = load_dataset(
            "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
        )["train"].to_pandas()
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results was not found in the hub 😕")
        return pd.DataFrame()

    task_columns = [col for col in data.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    # Explicit copy so the scaling below never writes into a slice of `data`.
    data = data[model_columns + task_lang_columns].copy()
    if task_lang_columns:
        # Scale only the columns that are kept; the other languages' columns
        # were scaled and then thrown away before.
        data[task_lang_columns] = data[task_lang_columns] * 100
    return data
109
 
110
def load_dataset_card(task) -> list:
    """Fetch the metadata row describing one task dataset.

    Downloads ``task_metadata.json`` from the ``iberbench/<task>`` dataset
    repository and returns its values ordered like ``dataset_columns``.

    Args:
        task: Task name; the repository id is ``"iberbench/" + task``.

    Returns:
        One value per entry of ``dataset_columns``. A field missing from the
        metadata becomes ``"-"`` (``[]`` for ``"labels"``). When the
        repository or the metadata file does not exist, an error is shown in
        the UI and a row made only of those placeholders is returned.
    """
    # Local import: this exception is not among the file's top-level imports.
    from huggingface_hub.utils import EntryNotFoundError

    def placeholder(column):
        # "labels" is rendered as a list column, so its empty value is a list.
        return [] if column == "labels" else "-"

    name_repo = "iberbench/" + task
    try:
        info_path = hf_hub_download(
            repo_id=name_repo,
            filename="task_metadata.json",
            repo_type="dataset",
        )
    except RepositoryNotFoundError:
        st.error(task + ": dataset was not found in the hub 🚫")
        return [placeholder(column) for column in dataset_columns]
    except EntryNotFoundError:
        # The repository exists but carries no metadata file.
        st.error(task + ": task_metadata.json was not found in the dataset 🚫")
        return [placeholder(column) for column in dataset_columns]

    with open(info_path, "r", encoding="utf-8") as f:
        info = json.load(f)
    return [
        info[column] if column in info else placeholder(column)
        for column in dataset_columns
    ]
130
 
 
 
 
 
 
 
131
 
132
def active_data(lang) -> pd.DataFrame:
    """Return a copy of the rows of one language's leaderboard marked active.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` session entry is read.

    Returns:
        A copy of the rows whose "Active" value equals True. When the frame
        has no "Active" column, every row is treated as active and a copy of
        the whole frame is returned instead of raising ``KeyError``.
    """
    frame = st.session_state[f"leaderboard_data_{lang}"]
    if "Active" not in frame.columns:
        return frame.copy()
    return frame[frame["Active"] == True].copy()  # noqa: E712
134
 
def get_index(lang, row) -> pd.Series:
    """Translate a positional row number of the active view into its index label.

    Args:
        lang: Language whose active rows are inspected.
        row: Zero-based position inside ``active_data(lang)``.

    Returns:
        The ``name`` of the selected row, i.e. its label in the frame index.
        NOTE(review): the annotation says ``pd.Series`` but ``.name`` of a row
        is a single label; left untouched here.
    """
    visible_rows = active_data(lang)
    selected = visible_rows.iloc[row]
    return selected.name
137
 
 
def commit(lang) -> None:
    """Copy the pending edits of one language's editor into its leaderboard.

    Reads ``edited_rows`` from the ``edited_data_<lang>`` session entry and
    writes every changed cell into ``leaderboard_data_<lang>``.

    Args:
        lang: Language whose edits are applied.
    """
    pending = st.session_state[f"edited_data_{lang}"]["edited_rows"]
    for row, changes in pending.items():
        # Editor rows are positional; map them back to the frame's index label.
        row_index = get_index(lang, row)
        for key, value in changes.items():
            st.session_state[f"leaderboard_data_{lang}"].at[row_index, key] = value
143
+
144
+
145
+ # -----------------------------------------------------------------------------
146
+ # Visualization helper functions
147
+ # -----------------------------------------------------------------------------
148
def create_table_results(df_mean: pd.DataFrame):
    """Render the ranking table for one summary frame.

    Models are ranked by "Mean" (dense ranking, highest first) and the three
    best ranks get a medal emoji. The rank text is stored in a "Rank" column
    of ``df_mean`` itself, then the frame is shown sorted by "Mean"
    descending.

    Args:
        df_mean: Frame with a numeric "Mean" column plus the model metadata
            columns. It is modified in place (the "Rank" column is added, or
            overwritten when it already exists).
    """
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    # na_option="bottom": a model without a score gets the worst rank instead
    # of NaN, which astype(int) cannot convert.
    positions = (
        df_mean["Mean"]
        .rank(method="dense", ascending=False, na_option="bottom")
        .astype(int)
        .tolist()
    )
    rank_value = [
        f"{pos} {medals[pos]}" if pos in medals else str(pos)
        for pos in positions
    ]
    if "Rank" in df_mean.columns:
        # insert() refuses duplicate columns; refresh the existing one instead.
        df_mean["Rank"] = rank_value
    else:
        df_mean.insert(0, "Rank", rank_value)
    df_final = df_mean.sort_values("Mean", ascending=False)
    st.dataframe(
        df_final,
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type 📌"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )
171
 
172
 
173
def create_table_all_results(aggregated_df: pd.DataFrame):
    """Render the general ranking table with one extra column per language.

    The per-language "Mean" values from ``create_data_results_per_language``
    are pivoted to one column per language and attached to ``aggregated_df``.
    Models are then ranked by the overall "Mean" (dense ranking, highest
    first, medals for the top three) and the table is shown sorted by "Mean"
    descending.

    Args:
        aggregated_df: One row per model with "model_name" and "Mean"
            columns. It is modified in place (language columns and "Rank").
    """
    combined_df = create_data_results_per_language()
    if combined_df is not None and not combined_df.empty:
        df_lang = combined_df.pivot(index="model_name", columns="language", values="Mean")
        # Align on the model name instead of trusting that both frames happen
        # to list the models in the same order.
        aligned = df_lang.reindex(aggregated_df["model_name"])
        for language in df_lang.columns:
            aggregated_df[language] = aligned[language].to_numpy()

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    # na_option="bottom": a model without a score gets the worst rank instead
    # of NaN, which astype(int) cannot convert.
    positions = (
        aggregated_df["Mean"]
        .rank(method="dense", ascending=False, na_option="bottom")
        .astype(int)
        .tolist()
    )
    rank_value = [
        f"{pos} {medals[pos]}" if pos in medals else str(pos)
        for pos in positions
    ]
    if "Rank" in aggregated_df.columns:
        # insert() refuses duplicate columns; refresh the existing one instead.
        aggregated_df["Rank"] = rank_value
    else:
        aggregated_df.insert(0, "Rank", rank_value)
    df_final = aggregated_df.sort_values("Mean", ascending=False)
    st.dataframe(
        df_final,
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type 📌"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )
199
+
200
+
201
def create_scatter_chart(df: pd.DataFrame, id_: str):
    """Plot "Mean" against "num_parameters", one coloured marker per model.

    Args:
        df: Frame with "num_parameters", "Mean", "model_name" and
            "model_type" columns.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    scatter_options = {
        "x": "num_parameters",
        "y": "Mean",
        "color": "model_name",
        "size": "num_parameters",
        "hover_data": ["model_type"],
        "labels": {"num_parameters": "Num parameters"},
    }
    fig = px.scatter(df, **scatter_options)
    fig.update_layout(template="plotly_white")
    chart_key = id_ + str(random.random())
    st.plotly_chart(fig, use_container_width=True, key=chart_key)
213
+
214
def create_radar_chart(df: pd.DataFrame, id_: str):
    """Draw a closed, filled polar line chart of the ten best models by "Mean".

    Args:
        df: Frame with "Mean" and "model_name" columns.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    top_models = df.sort_values(by="Mean", ascending=False).head(10)
    radar_df = pd.DataFrame(
        {"r": top_models["Mean"], "theta": top_models["model_name"]}
    )
    fig = px.line_polar(
        radar_df,
        r="r",
        theta="theta",
        line_close=True,
        markers=True,
    )
    fig.update_traces(fill="toself")
    chart_key = id_ + str(random.random())
    st.plotly_chart(fig, use_container_width=True, key=chart_key)
225
 
226
 
227
def create_pie_chart(df: pd.DataFrame, id_: str):
    """Show how many rows of ``df`` fall into each "model_type" as a pie chart.

    Args:
        df: Frame with a "model_type" column.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    type_counts = df["model_type"].value_counts().reset_index()
    # Fix the two column names regardless of what reset_index() called them.
    type_counts.columns = ["model_type", "count"]
    fig = px.pie(
        type_counts,
        values="count",
        names="model_type",
        labels={"model_type": "Model type"},
    )
    chart_key = id_ + str(random.random())
    st.plotly_chart(fig, use_container_width=True, key=chart_key)
235
 
 
 
236
 
237
def create_box_plot(df: pd.DataFrame, id_: str):
    """Draw a box plot of "Mean" grouped by "model_type", showing every point.

    Args:
        df: Frame with "model_type" and "Mean" columns.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    box_options = {
        "x": "model_type",
        "y": "Mean",
        "points": "all",
        "labels": {"model_type": "Model type"},
    }
    fig = px.box(df, **box_options)
    chart_key = id_ + str(random.random())
    st.plotly_chart(fig, use_container_width=True, key=chart_key)
243
 
244
+
245
def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
    """Summarise one language's leaderboard per semantic category.

    For every entry of ``task_types`` the task columns listed under that
    category in ``semantic_categories`` are averaged per model (rounded to
    two decimals). A "Mean" column with the average over all category
    columns is inserted right after the model metadata columns.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` session entry is read.
        task_types: Category names (keys of ``semantic_categories``).

    Returns:
        Frame with the model metadata columns, "Mean", then one column per
        category that has at least one task column. "Mean" is NaN when no
        category matched. An empty leaderboard yields an empty frame with the
        metadata and "Mean" columns.
    """
    data = st.session_state[f"leaderboard_data_{lang}"]
    if data.empty:
        # A failed load leaves a column-less frame; selecting the metadata
        # columns from it would raise KeyError.
        return pd.DataFrame(columns=model_columns + ["Mean"])

    df = data[model_columns].copy()
    category_cols = []
    for t in task_types:
        task_list = semantic_categories[t]
        cols = [col for col in data.columns if "iberbench/" + col in task_list]
        if cols:
            df[t] = data[cols].mean(axis=1).round(2)
            if t not in category_cols:
                category_cols.append(t)

    if category_cols:
        # Average over every category column; the previous positional slice
        # (3:-1) left the last category out of the mean.
        mean = df[category_cols].mean(axis=1).round(2)
    else:
        mean = float("nan")
    df.insert(len(model_columns), "Mean", mean)
    return df
260
+
261
+
262
+
263
def get_all_languages_summary_df() -> pd.DataFrame:
    """Stack the per-language summaries of every loaded leaderboard.

    Every session entry whose key starts with ``leaderboard_data_`` is
    summarised with ``get_summary_df`` and tagged with a "language" column.

    Returns:
        The concatenation of all summaries with a fresh integer index, or an
        empty DataFrame when no leaderboard is loaded.
    """
    prefix = "leaderboard_data_"
    combined_df = pd.DataFrame()
    for key in st.session_state:
        if not key.startswith(prefix):
            continue
        lang = key.split(prefix)[1]
        summary_df = get_summary_df(lang, select_task_per_language(lang))
        summary_df["language"] = lang
        combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
    return combined_df
274
+
275
+
276
def create_results_visualization_lang(lang: str):
    """Render the results table and the plot tabs for a single language.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` session entry is shown.
    """
    # Results table for the selected language.
    summary_df = get_summary_df(lang, select_task_per_language(lang))
    tasks_df = st.session_state[f"leaderboard_data_{lang}"].copy()
    create_table_results(summary_df)

    # In-language plots, one per tab.
    st.markdown("### Language plots 📊")
    radar_tab, size_tab, type_tab, industry_tab, category_tab = st.tabs([
        "Top 10 performance 🥇",
        "Performance vs. size 📏",
        "Performance per type 💡",
        "Fundamental vs industry ⚖️",
        "Performance per task category 📈",
    ])
    with radar_tab:
        create_radar_chart(summary_df, lang + "in_radar")
    with size_tab:
        create_scatter_chart(summary_df, lang + "in_scatter")
    with type_tab:
        create_box_plot(summary_df, lang + "in_box")
    with industry_tab:
        create_box_plot_per_task_category(tasks_df, lang + "in_box_task_cat")
    with category_tab:
        create_box_plot_per_semantic_category(tasks_df, lang + "in_box_sem_cat")
304
+
305
+ # -----------------------------------------------------------------------------
306
+ # Functions for other visualization sections
307
+ # -----------------------------------------------------------------------------
308
+
309
def select_task_per_language(lang: str):
    """List the semantic categories that have at least one task for a language.

    A category is kept when the name of one of its tasks (the part after
    ``"iberbench/"``) is a column of ``leaderboard_data_<lang>``.

    Args:
        lang: Language whose leaderboard columns are inspected.

    Returns:
        Category names in the order they appear in ``semantic_categories``.
    """
    available = list(st.session_state[f"leaderboard_data_{lang}"].columns)
    types = []
    for category, tasks in semantic_categories.items():
        # Build the full list first so every task id is parsed, as before.
        matches = [task.split("iberbench/")[1] in available for task in tasks]
        if any(matches) and category not in types:
            types.append(category)
    return types
318
+
319
def create_dataset_info_per_language(lang: str):
    """Show a table with the dataset metadata of every task of a language.

    Each leaderboard column that is not a model metadata column is treated as
    a task name and described with ``load_dataset_card``.

    Args:
        lang: Language whose ``leaderboard_data_<lang>`` session entry is read.
    """
    data = st.session_state[f"leaderboard_data_{lang}"]
    if data.empty:
        st.write("No data found to display on leaderboard 😔.")
        return

    # Skip the bookkeeping "Active" column by name. Dropping the last column
    # positionally silently lost a real task whenever "Active" was absent.
    not_tasks = set(model_columns) | {"Active"}
    tasks = [col for col in data.columns if col not in not_tasks]
    all_values = [load_dataset_card(task) for task in tasks]
    df = pd.DataFrame(all_values, columns=dataset_columns)
    st.dataframe(
        df,
        column_config={
            "workshop": st.column_config.TextColumn("Workshop 🏫", help="Workshop to belong to the shared task"),
            "shared_task": st.column_config.TextColumn("Shared Task 📋", help="Shared Task name"),
            "year": st.column_config.TextColumn("Year 📅", help="Year of the shared task"),
            "task_type": st.column_config.TextColumn("Task Type 🔖", help="Shared Task type"),
            "language": st.column_config.TextColumn("Language 🌐", help="Shared Task language"),
            "url": st.column_config.ListColumn("Task URL 🔗", help="Shared Task url"),
            "language_variety": st.column_config.TextColumn("Language Variety 🗣️", help="Shared Task language variety"),
            "problem_type": st.column_config.TextColumn("Problem Type ❓", help="Shared Task problem type"),
            "num_labels": st.column_config.NumberColumn("Number of Labels 🔢", help="Shared Task number of labels"),
            "labels": st.column_config.ListColumn("Labels 🏷️", help="Shared Task labels"),
        },
        hide_index=True,
    )
349
+
350
def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
    """Box plot of the per-model average score of each professional category.

    For every category of ``professional_mapping`` the task columns listed
    under it are averaged per row (rounded to two decimals); the categories
    are then plotted side by side.

    Args:
        df: Raw leaderboard frame with the model metadata columns and one
            column per task. It is not modified.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    df = df.copy()  # the category columns are scratch data, keep the caller's frame intact
    melt_vars = []
    for category, tasks in professional_mapping.items():
        relevant_cols = [col for col in df.columns if "iberbench/" + col in tasks]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)
    # Mapping keys are already unique; keeping the list (instead of a set
    # round-trip) makes the category order on the x axis deterministic.
    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    df_melt = df.melt(id_vars=id_vars, value_vars=melt_vars, var_name="Task Category", value_name="Performance")
    fig = px.box(
        df_melt, x="Task Category", y="Performance", points="all",
        labels={"Performance": "Performance (%)"}
    )
    st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
368
+
369
def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
    """Box plot of the per-model average score of each semantic category.

    For every category of ``semantic_categories`` the task columns listed
    under it are averaged per row (rounded to two decimals); the categories
    are then plotted side by side.

    Args:
        df: Raw leaderboard frame with the model metadata columns and one
            column per task. It is not modified.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    df = df.copy()  # the category columns are scratch data, keep the caller's frame intact
    melt_vars = []
    for category, tasks in semantic_categories.items():
        relevant_cols = [col for col in df.columns if "iberbench/" + col in tasks]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)
    # Mapping keys are already unique; keeping the list (instead of a set
    # round-trip) makes the category order on the x axis deterministic.
    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    df_melt = df.melt(id_vars=id_vars, value_vars=melt_vars, var_name="Task Category", value_name="Performance")
    fig = px.box(
        df_melt, x="Task Category", y="Performance", points="all",
        labels={"Performance": "Performance (%)"}
    )
    st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
387
+
388
def create_histogram(df: pd.DataFrame, id_: str):
    """Plot a 20-bin histogram of the "num_parameters" column.

    Args:
        df: Frame with a "num_parameters" column.
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    axis_labels = {"num_parameters": "Num parameters", "count": "Count"}
    fig = px.histogram(df, x="num_parameters", nbins=20, labels=axis_labels)
    fig.update_layout(template="plotly_white")
    chart_key = id_ + str(random.random())
    st.plotly_chart(fig, use_container_width=True, key=chart_key)
394
+
395
+
396
def create_data_results_per_language() -> pd.DataFrame:
    """Combine every loaded leaderboard into one frame with a "Mean" column.

    Every session entry whose key starts with ``leaderboard_data_`` is
    copied, tagged with a "language" column taken from the key (unless it
    already has one) and concatenated. When the result has no "Mean" column
    it is computed as the row-wise mean of the numeric columns that are not
    model metadata, "language" or "Active", rounded to two decimals.

    Returns:
        The combined frame. When there is no data, or no numeric column to
        average, a warning is shown in the UI and an empty DataFrame is
        returned, so the declared return type always holds.
    """
    prefix = "leaderboard_data_"
    frames = []
    for key in st.session_state.keys():
        if key.startswith(prefix):
            temp_df = st.session_state[key].copy()
            # If the "language" column is missing, derive it from the key.
            if "language" not in temp_df.columns:
                temp_df["language"] = key.split(prefix)[1]
            frames.append(temp_df)
    # Concatenate once instead of growing the frame inside the loop.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if combined_df.empty:
        st.warning("No data available for any language ⚠️.")
        return pd.DataFrame()

    if "Mean" not in combined_df.columns:
        # Exclude metadata, language, and any non-numeric columns.
        excluded = ["model_name", "model_type", "num_parameters", "language", "Active"]
        performance_cols = [
            col for col in combined_df.columns
            if col not in excluded
            and pd.api.types.is_numeric_dtype(combined_df[col])
        ]
        if not performance_cols:
            st.warning("No numeric task performance columns available to compute 'Mean' ⚠️.")
            return pd.DataFrame()
        combined_df["Mean"] = combined_df[performance_cols].mean(axis=1).round(2)
    return combined_df
428
+
429
def create_box_plot_per_language(id_: str):
    """Draw a box plot of the "Mean" performance grouped by language.

    The data comes from ``create_data_results_per_language``. Nothing is
    drawn when that helper has no data to offer.

    Args:
        id_: Prefix of the chart's element key; a random suffix is appended,
            so the key differs on every call.
    """
    combined_df = create_data_results_per_language()
    if combined_df is None or combined_df.empty:
        # Without this guard, plotting the missing data would raise.
        return
    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
440
+
441
 
442
# NOTE: `get_all_languages_summary_df` is already defined earlier in this
# module with an identical body. The duplicate definition that used to live
# here was removed so the function has a single source of truth.
453
 
 
 
 
454
 
455
def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
    """Aggregate the per-language summaries into one row per model.

    Groups the output of ``get_all_languages_summary_df`` by "model_name",
    keeping the first "model_type" and averaging "num_parameters" and "Mean"
    across languages ("Mean" is rounded to two decimals).

    Returns:
        One row per model with "model_name", "model_type", "num_parameters"
        and "Mean". When no summary data is available an empty frame with
        those columns is returned instead of failing on the group-by.
    """
    df = get_all_languages_summary_df()
    if df.empty or "model_name" not in df.columns:
        return pd.DataFrame(
            columns=["model_name", "model_type", "num_parameters", "Mean"]
        )
    agg_df = df.groupby("model_name", as_index=False).agg({
        "model_type": "first",     # the first value seen for the model is kept
        "num_parameters": "mean",  # average model size across languages
        "Mean": "mean",            # average performance
    })
    agg_df["Mean"] = agg_df["Mean"].round(2)
    return agg_df
468
+
469
def get_all_languages_raw_df() -> pd.DataFrame:
    """Stack the raw leaderboard frames of every loaded language.

    Every session entry whose key starts with ``leaderboard_data_`` is
    copied and tagged with a "language" column taken from the key, so plots
    that need the original task columns can use a single frame.

    Returns:
        The concatenation of all frames with a fresh integer index, or an
        empty DataFrame when no leaderboard is loaded.
    """
    prefix = "leaderboard_data_"
    combined_df = pd.DataFrame()
    for key in st.session_state:
        if not key.startswith(prefix):
            continue
        tagged = st.session_state[key].copy()
        tagged["language"] = key.split(prefix)[1]
        combined_df = pd.concat([combined_df, tagged], ignore_index=True)
    return combined_df
482
+
483
+
484
+ # -----------------------------------------------------------------------------
485
+ # Sidebar for Navigation and Global Settings
486
+ # -----------------------------------------------------------------------------
487
# Sidebar: title, page selector and a short description of the project.
st.sidebar.markdown("<h2 style='text-align: center;'>IberBench 🌍</h2>", unsafe_allow_html=True)
# Widgets should carry a non-empty label; it is hidden with
# label_visibility so the sidebar looks the same as before.
menu = st.sidebar.radio(
    "Navigation",
    ["Leaderboard 📊", "Submit Model 🚀", "Datasets 📚", "About ℹ️"],
    label_visibility="collapsed",
)
st.sidebar.markdown("---")
st.sidebar.markdown(
    """
    <p style="font-size:0.9rem; text-align:center;">
    A leaderboard of LLMs on languages from the Iberian Peninsula and Ibero-America
    </p>
    """,
    unsafe_allow_html=True,
)
498
+
499
def load_languages_set():
    """Read the language settings from the ``LANGUAGES_SETTINGS`` YAML file.

    Returns:
        Whatever the YAML document parses to. The callers in this file use it
        as a mapping from language name to a settings dict.
    """
    # Read as UTF-8 explicitly so the result does not depend on the host locale.
    with open(LANGUAGES_SETTINGS, "r", encoding="utf-8") as f:
        return yaml_load(f)
502
 
503
# Load every configured language once and keep its leaderboard in the session.
lang_set = load_languages_set()

for lang in lang_set.keys():
    # The "Mixed" entry is loaded with the Spanish column filter.
    source_lang = "Spanish" if lang == "Mixed" else lang
    data = load_data(source_lang)
    session_key = f"leaderboard_data_{lang}"
    if session_key not in st.session_state:
        st.session_state[session_key] = data
512
+
513
+ # -----------------------------------------------------------------------------
514
+ # Main Content based on Navigation
515
+ # -----------------------------------------------------------------------------
516
+ if menu == "Leaderboard 📊":
517
+ st.markdown("<div class='main-header'><h1>Leaderboard 📊</h1></div>", unsafe_allow_html=True)
518
+ lang_iber = [k for k, v in lang_set.items() if v["category"] == "Iberian Peninsula languages"]
519
+ st.markdown("### General ranking 🏆")
520
+
521
+ # ---------------------------
522
+ # All-language plots section
523
+ # ---------------------------
524
+ # Use aggregated data for plots where each model must appear once with averaged values.
525
+ aggregated_df = get_all_languages_aggregated_summary_df()
526
+ create_table_all_results(aggregated_df)
527
+ st.markdown("### General plots 📊")
528
+ # Use raw data for Fundamental vs Professional and Task Category plots.
529
+ raw_all_df = get_all_languages_raw_df()
530
+ all_lang_tabs = st.tabs([
531
+ "Top 10 performance 🥇",
532
+ "Performance vs. size 📏",
533
+ "Type distribution 🎨",
534
+ "Performance per type 💡",
535
+ "Distribution of sizes 📊",
536
+ "Fundamental vs industry ⚖️",
537
+ "Performance per task category 📈",
538
+ "Performance per language 🌐",
539
+ ])
540
+ with all_lang_tabs[0]:
541
+ create_radar_chart(aggregated_df, "all_radar")
542
+ with all_lang_tabs[1]:
543
+ create_scatter_chart(aggregated_df, "all_scatter")
544
+ with all_lang_tabs[2]:
545
+ create_pie_chart(aggregated_df, "all_pie")
546
+ with all_lang_tabs[3]:
547
+ create_box_plot(aggregated_df, "all_box")
548
+ with all_lang_tabs[4]:
549
+ create_histogram(aggregated_df, "all_hist")
550
+ with all_lang_tabs[5]:
551
+ # Use the raw combined data so that professional task columns are available.
552
+ create_box_plot_per_task_category(raw_all_df, "all_box_task_cat")
553
+ with all_lang_tabs[6]:
554
+ create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
555
+ with all_lang_tabs[7]:
556
+ create_box_plot_per_language("all_box_language")
557
+
558
+ # Results per language
559
+ st.markdown("---")
560
+ st.markdown("### Language ranking 🏆")
561
+ lang_choice = st.selectbox("Select a language 🌐:", list(lang_iber), key="lang_leaderboard")
562
+ if lang_choice == "Spanish":
563
+ variations = [k for k, v in lang_set.items() if v["category"] in ["Spanish Variations languages", "Mixed languages"]]
564
+ tabs_var = st.tabs(variations)
565
+ for var, tab in zip(variations, tabs_var):
566
+ with tab:
567
+ create_results_visualization_lang(var)
568
+ else:
569
+ create_results_visualization_lang(lang_choice)
570
+
571
+ elif menu == "Submit Model 🚀":
572
+ st.markdown("<div class='main-header'><h1>Submit Your Model 🚀</h1></div>", unsafe_allow_html=True)
573
+ st.markdown("## How to submit a model 📤")
574
+
575
+ # CSS
576
+ st.markdown("""
577
+ <style>
578
+ .card-container {
579
+ max-width: 300px;
580
+ margin: auto;
581
+ text-align: left;
582
+ font-size: 1rem;
583
+ padding: 0.5rem;
584
+ box-sizing: border-box;
585
+ }
586
+ .id-container {
587
+ display: flex;
588
  align-items: center;
589
+ margin-bottom: 1rem;
590
+ }
591
+ .id-circle {
592
+ width: 32px;
593
+ height: 32px;
594
+ border-radius: 50%;
595
+ display: flex;
596
+ align-items: center;
597
+ justify-content: center;
598
+ border: 1px solid #007BFF;
599
+ color: #007BFF;
600
+ font-size: 0.875rem;
601
+ font-weight: 600;
602
+ background-color: transparent;
603
+ margin-right: 8px;
604
+ }
605
+ .guide-content {
606
+ word-wrap: break-word;
607
+ }
608
+ .guide-title {
609
+ font-weight: bold;
610
+ font-size: 1rem;
611
+ margin-left: 8px;
612
+ }
613
+ </style>
614
+ """, unsafe_allow_html=True)
615
+
616
+ def render_card(content):
617
+ html = f"""
618
+ <div class="card-container">
619
+ <div class="guide-content">
620
+ {content}
621
+ </div>
622
+ </div>
623
+ """
624
+ return html
625
+
626
+ # Load your HTML content from files
627
  guide_info_list = []
628
  html_path = "assets/html"
629
+ filenames = sorted(os.listdir(html_path))
630
+ for filename in filenames:
631
  file_path = os.path.join(html_path, filename)
632
  with open(file_path, "r", encoding="utf-8") as file:
633
+ raw_html = file.read()
634
+ guide_info_list.append(raw_html)
635
+
636
+ # Create the grid
637
+ num_columns = 3
638
+ num_rows = 2
639
+
640
+ for row in range(num_rows):
641
+ cols = st.columns(num_columns)
642
+ for col in range(num_columns):
643
+ index = row * num_columns + col
644
+ if index < len(guide_info_list):
645
+ with cols[col]:
646
+ st.markdown(render_card(guide_info_list[index]), unsafe_allow_html=True)
647
+
648
+ st.markdown("## Submission form 📝")
649
+ with st.form("submit_model_form", clear_on_submit=True):
650
  model_name = st.text_input(
651
+ "Model Name (format: user_name/model_name) 🧩",
652
  help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
653
  )
654
  description = st.text_area(
655
+ "Description ✍️",
656
+ help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
 
 
 
 
657
  )
658
+ user_contact = st.text_input("Your Contact Email 📧", help="User e-mail to contact when there are updates.")
659
  precision_option = st.selectbox(
660
+ "Choose precision format 🔢:",
661
+ help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
662
  options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
663
  index=0,
664
  )
665
  weight_type_option = st.selectbox(
666
+ "Select weight type ⚖️:",
667
+ help="Original: Complete model weights. Delta: Differences from base model. Adapter: Lightweight fine-tuning layers.",
668
  options=["Original", "Adapter", "Delta"],
669
  index=0,
670
  )
671
+ base_model_name = st.text_input("Base model (if applicable) 🏗️", help="Required for delta weights or adapters. This helps calculate total parameter count.", value="")
 
 
 
 
672
  model_type = st.selectbox(
673
+ "Choose model type 🔍:",
674
+ help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
675
+ options=["🟢 Pretrained", "🔶 Fine-tuned", "💬 Chat", "🤝 Merge"],
 
 
 
 
 
676
  )
677
+ submit_button = st.form_submit_button("Submit Request 🚀")
 
678
  if submit_button:
 
679
  use_chat_template = True if model_type == "💬 Chat" else False
680
  validation_error = validate_model(
681
+ model_name, precision_option, base_model_name, weight_type_option, use_chat_template
 
 
 
 
682
  )
683
  if validation_error is not None:
684
  st.error(validation_error)
685
  elif not re.match(r"[^@]+@[^@]+\.[^@]+", user_contact):
686
+ st.error("Invalid email address ⚠️.")
687
  else:
688
  input_dict = {
689
  "model_name": model_name,
 
696
  }
697
  try:
698
  log_submission(input_dict)
699
+ st.success("Your request has been sent successfully 🎉.")
700
  except Exception as e:
701
+ st.error(f"Failed to send your request: {e}. Please try again later.")
702
+
703
+ elif menu == "Datasets 📚":
704
+ st.markdown("<div class='main-header'><h1>Dataset Information 📚</h1></div>", unsafe_allow_html=True)
705
+ st.markdown("### Check the datasets 🔍")
706
+ lang_iber = [k for k, v in lang_set.items() if v["category"] == "Iberian Peninsula languages"]
707
+ lang_choice = st.selectbox("Select a language 🌐:", list(lang_iber), key="lang_dataset")
708
+ if lang_choice == "Spanish":
709
+ variations = [k for k, v in lang_set.items() if v["category"] in ["Spanish Variations languages", "Mixed languages"]]
710
+ tabs_var = st.tabs(variations)
711
+ for var, tab in zip(variations, tabs_var):
712
+ with tab:
713
+ if var == "Mixed":
714
+ create_dataset_info_per_language("Spanish")
715
+ else:
716
+ create_dataset_info_per_language(var)
717
+ else:
718
+ create_dataset_info_per_language(lang_choice)
719
+ st.markdown("### Task mappings 🔄")
720
+ st.markdown("For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks.")
721
+ tab1, tab2 = st.tabs(["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"])
722
+ with tab1:
723
+ st.json({category: [task.removeprefix("iberbench/") for task in tasks] for category, tasks in semantic_categories.items()})
724
+ with tab2:
725
+ st.json({category: [task.removeprefix("iberbench/") for task in tasks] for category, tasks in professional_mapping.items()})
726
+
727
+ elif menu == "About ℹ️":
728
+ st.markdown("<div class='main-header'><h1>About ℹ️</h1></div>", unsafe_allow_html=True)
729
+ st.markdown("""### 📖 What is IberBench?
730
+ IberBench is a hub of datasets for languages across the Iberian and Latin American regions, intended as a benchmark for evaluating causal language models. This initiative aims to enrich the Natural Language Processing (NLP) community in the Iberian Peninsula and Latin America. The benchmark enables the evaluation of NLP models in multiple Spanish variants and other languages such as Catalan, Galician, Basque, Portuguese, and Latin American Spanish, fostering assessments and developments that reflect the linguistic diversity of these regions.
731
+
732
+ We hope to drive multilingual research that considers the cultural and linguistic richness and complexity of the Spanish-speaking world, encouraging the creation of models that are truly representative of these realities.
733
+
734
+ ### 📂 What are the data sources?
735
+
736
+ IberBench contains datasets from prominent workshops in the field such as [IberLEF@SEPLN](https://sepln2024.infor.uva.es/eventos/iberlef-es/) or [PAN@CLEF](https://pan.webis.de/clef24/pan24-web/index.html), as well as established benchmarks such as those from HiTZ, UPF, BSC, CiTIUS-USC, among others, with the aim of incorporating standardized and consistent evaluation within this context, enhancing the value of the data and models derived from this effort.
737
+
738
+ We strictly adhere to all established guidelines and regulations concerning the use and publication of this data. Specifically:
739
+
740
+ - The collected datasets are published on 🤗HuggingFace private repositories, with appropriate credit given to the authors in the model card.
741
+ - Under no circumstances do we claim ownership of the datasets.
742
+ - The test splits of the datasets are kept private to avoid leakage from IberBench's side.
743
+
744
+ In any publication or presentation resulting from work with this data, we recognize the importance of citing and crediting the organizing teams that crafted the datasets used at IberBench.
745
+
746
+ ### 🙋 How can I join IberBench?
747
+
748
+ IberBench comprises a committee composed of specialists in NLP, language ethics, and gender discrimination, drawn from both academia and industry, which will oversee the development of the project, ensuring its quality and relevance.
749
+
750
+ To be part of this committee, you can ask to join the [IberBench organization at 🤗HuggingFace](https://huggingface.co/iberbench). Your request will be validated by experts already belonging to the organization.
751
+
752
+ ### 🤝 How can I contribute to IberBench?
753
+
754
+ First, the initial committee will gather all the datasets from prominent workshops. From this, you can contribute with new datasets to the IberBench organization. The process is as follows:
755
+
756
+ 1. Open a new discussion in the [IberBench discussions space](https://huggingface.co/spaces/iberbench/README/discussions), linking to an existing dataset in the 🤗HuggingFace hub and explaining why the inclusion is relevant.
757
+ 2. Discuss with the committee for the approval or rejection of the dataset.
758
+ 3. If approved, your dataset will be included in the IberBench datasets and will be used to evaluate LLMs in the IberBench leaderboard.
759
+
760
+ IberBench will never claim ownership over the dataset; the original author will receive all credit.
761
+
762
+ ### 💬 Social networks
763
+
764
+ You can reach us at:
765
+
766
+ - **X**: [https://x.com/IberBench](https://x.com/IberBench)
767
+ - **🤗 Discussions**: [https://huggingface.co/spaces/iberbench/README/discussions](https://huggingface.co/spaces/iberbench/README/discussions)
768
+
769
+ ### 🫶 Acknowledgements
770
+
771
+ We are incredibly grateful to the amazing teams behind the datasets from workshops like IberLEF, IberEval, and TASS under the umbrella of the [SEPLN](http://www.sepln.org/sepln), as well as the established benchmarks from HiTZ, UPF, BSC, CiTIUS-USC, among others. Their hard work and dedication to advancing NLP have made this benchmark possible. Huge thanks for sharing your invaluable resources with the community! 🚀👏
772
+
773
+ IberBench has been funded by the Valencian Institute for Business Competitiveness (IVACE). <br>
774
+
775
+ <style>
776
+ body {
777
+ margin: 0;
778
+ display: flex;
779
+ flex-direction: column;
780
+ min-height: 100vh;
781
+ }
782
+ .footer {
783
+ margin-top: auto;
784
+ display: flex;
785
+ flex-direction: column;
786
+ align-items: center;
787
+ text-align: center;
788
+ width: 100%;
789
+ background: white;
790
+ padding: 5px 0;
791
+ }
792
+ .footer p {
793
+ margin: 0;
794
+ font-size: 16px;
795
+ }
796
+ .logos {
797
+ display: flex;
798
+ justify-content: center;
799
+ align-items: center; /* Align images properly */
800
+ gap: 20px;
801
+ }
802
+ .logos img {
803
+ display: block;
804
+ margin: 0;
805
+ padding: 0;
806
+ max-height: 100px; /* Ensures both images have the same height */
807
+ width: auto; /* Keeps aspect ratio */
808
+ }
809
+ </style>
810
+ </br>
811
+ <div class="footer">
812
+ <p>Developed by Symanto with ❤️</p>
813
+ <div class="logos">
814
+ <img src="https://www.ivace.es/images/logo2-ivace.PNG">
815
+ <img src="https://www.symanto.com/wp-content/uploads/Logos/symanto.svg">
816
+ </div>
817
+ </div>
818
+ """, unsafe_allow_html=True)
assets/html/01_model_info.html CHANGED
@@ -1,12 +1,9 @@
1
- <div style="margin-left: 10px;">
2
- <h4 style="margin: 0; color: #007BFF;">Model Information</h4>
3
- <p>
4
- Your model should be <strong>public</strong> on the Hub and follow the
5
- <strong>username/model-id</strong> format (e.g., mistralai/Mistral-7B-v0.1).
6
- Specify the <strong>revision</strong> (commit hash or branch) and <strong>model type</strong>.
7
- </p>
8
- <a href="https://huggingface.co/docs/hub/models-uploading" target="_blank"
9
- style="color: #007BFF; text-decoration: underline; font-family: monospace;">
10
- Model uploading guide →
11
- </a>
12
- </div>
 
1
+ <h4 style="margin: 0; color: #007BFF;">Model Information</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d; margin: 0; line-height: 1.5;">
3
+ Your model should be <strong>public</strong> on the Hub and follow the
4
+ <strong>username/model-id</strong> format (e.g., mistralai/Mistral-7B-v0.1).
5
+ </p>
6
+ <a href="https://huggingface.co/docs/hub/models-uploading" target="_blank"
7
+ style="color: #007BFF; text-decoration: underline; font-family: monospace;">
8
+ Model uploading guide →
9
+ </a>
 
 
 
assets/html/02_technical_detail.html CHANGED
@@ -1,17 +1,5 @@
1
- <div style="margin-left: 10px;">
2
- <h4 style="margin: 0; color: #007BFF;">Technical Details</h4>
3
- <p style="font-size: 0.875rem; color: #6c757d; margin: 0; line-height: 1.5;">
4
- Make sure your model can be <strong>loaded locally</strong> before submitting:
5
- </p>
6
- <div style="background-color: #f5f5f5; padding: 1rem; border-radius: 5px; font-family: monospace; color: #212529;">
7
- <pre style="margin: 0; padding: 0; font-size: 1rem; white-space: pre-wrap; word-wrap: break-word;">
8
- <code>
9
- from transformers import AutoConfig, AutoModel, AutoTokenizer
10
- config = AutoConfig.from_pretrained("your-username/your-model", revision="main")
11
- model = AutoModel.from_pretrained("your-username/your-model", revision="main")
12
- tokenizer = AutoTokenizer.from_pretrained("your-username/your-model", revision="main")
13
- </code>
14
- </pre>
15
- </div>
16
- <a href="https://huggingface.co/docs/transformers/installation" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Transformers documentation →</a>
17
- </div>
 
1
+ <h4 style="margin: 0; color: #007BFF;">Technical Details</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d; margin: 0; line-height: 1.5;">
3
+ Make sure your model can be <strong>loaded locally</strong> with <code>AutoModel</code> before submitting.
4
+ </p>
5
+ <a href="https://huggingface.co/docs/transformers/installation" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Transformers documentation →</a>
 
 
 
 
 
 
 
 
 
 
 
 
assets/html/03_licenses.html ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <h4 style="margin: 0; color: #007BFF;">License Requirements</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d;">
3
+ A <strong>license tag</strong> is required. <strong>Open licenses</strong>
4
+ (Apache, MIT, etc) are strongly recommended.
5
+ </p>
6
+ <a href="https://huggingface.co/docs/hub/repositories-licenses" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">About model licenses →</a>
assets/html/03_linceses.html DELETED
@@ -1,8 +0,0 @@
1
- <div style="margin-left: 10px;">
2
- <h4 style="margin: 0; color: #007BFF;">License Requirements</h4>
3
- <p style="font-size: 0.875rem; color: #6c757d;">
4
- A <strong>license tag</strong> is required. <strong>Open licenses</strong>
5
- (Apache, MIT, etc) are strongly recommended.
6
- </p>
7
- <a href="https://huggingface.co/docs/hub/repositories-licenses" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">About model licenses →</a>
8
- </div>
 
 
 
 
 
 
 
 
 
assets/html/04_model_card.html CHANGED
@@ -1,9 +1,7 @@
1
- <div style="margin-left: 10px;">
2
- <h4 style="margin: 0; color: #007BFF;">Model Card Requirements</h4>
3
- <p style="font-size: 0.875rem; color: #6c757d;">
4
- Your model card must include: <strong>architecture</strong>,
5
- <strong>training details</strong>, <strong>dataset information</strong>, intended use, limitations, and
6
- <strong>performance metrics</strong>.
7
- </p>
8
- <a href="https://huggingface.co/docs/hub/model-cards" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Model cards guide →</a>
9
- </div>
 
1
+ <h4 style="margin: 0; color: #007BFF;">Model Card</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d;">
3
+ Your model card must include: <strong>architecture</strong>,
4
+ <strong>training details</strong>, <strong>dataset information</strong>, intended use, limitations, and
5
+ <strong>performance metrics</strong>.
6
+ </p>
7
+ <a href="https://huggingface.co/docs/hub/model-cards" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Model cards guide →</a>
 
 
assets/html/05_checklist.html CHANGED
@@ -1,8 +1,6 @@
1
- <div style="margin-left: 10px;">
2
- <h4 style="margin: 0; color: #007BFF;">Checklist</h4>
3
- <p style="font-size: 0.875rem; color: #6c757d;">
4
- Ensure your model is <strong>public</strong>, uses <strong>safetensors</strong> format,
5
- has a <strong>license tag</strong>, and <strong>loads correctly</strong> with the provided code.
6
- </p>
7
- <a href="https://huggingface.co/docs/hub/repositories-getting-started" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Sharing best practices →</a>
8
- </div>
 
1
+ <h4 style="margin: 0; color: #007BFF;">Checklist</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d;">
3
+ Ensure your model is <strong>public</strong>, uses <strong>safetensors</strong> format,
4
+ has a <strong>license tag</strong>, and <strong>loads correctly</strong> with <code>AutoModel</code>.
5
+ </p>
6
+ <a href="https://huggingface.co/docs/hub/repositories-getting-started" target="_blank" style="color: #007BFF; text-decoration: underline; font-family: monospace;">Sharing best practices →</a>
 
 
assets/html/06_ready.html ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <h4 style="margin: 0; color: #007BFF;">Ready!</h4>
2
+ <p style="font-size: 0.875rem; color: #6c757d;">
3
+ If you meet all these requirements, your model will be evaluated 🎉
4
+ </p>
src/task_mappings.py CHANGED
@@ -153,7 +153,7 @@ semantic_categories = {
153
  # Professional tasks are those that have economic interest for the industry.
154
  # Tasks like author profiling, sentiment analysis, machine-generated text detection, fake news detection, stance, discrimination, etc. are under this category.
155
  professional_mapping = {
156
- "Fundamental NLP": [
157
  "iberbench/iberlef-adobo-lexical_borrowing_chunking-2021-spanish",
158
  "iberbench/general-eus_exams-question_answering-2024-basque",
159
  "iberbench/general-eus_exams-question_answering-2024-spanish",
@@ -172,10 +172,10 @@ professional_mapping = {
172
  "iberbench/general-belebele-reading_comprehension-2024-spanish",
173
  "iberbench/general-escola-linguistic_acceptability-2024-spanish",
174
  "iberbench/general-xnli-textual_entailment-2024-spanish",
175
- "iberbench/general-xnli-textual_entailment-2024-galician",
176
  "iberbench/general-galcola-linguistic_acceptability-2024-galician",
177
- "iberbench/general-xstorycloze-question_answering-2024-galician",
178
- "iberbench/general-belebele-reading_comprehension-2024-portuguese",,
179
  "iberbench/general-xstorycloze-question_answering-2024-portuguese",
180
  "iberbench/general-xstorycloze-question_answering-2024-catalan",
181
  "iberbench/general-xnli-textual_entailment-2024-catalan",
@@ -185,8 +185,8 @@ professional_mapping = {
185
  "iberbench/general-copa-commonsense_reasoning-2024-catalan",
186
  "iberbench/general-catcola-linguistic_acceptability-2024-catalan",
187
  "iberbench/general-teca-textual_entailment-2021-catalan",
188
- ],
189
- "Professional NLP": [
190
  "iberbench/iberlef-restmex-sentiment_analysis-2022-spanish-mexico",
191
  "iberbench/iberlef-restmex-sentiment_analysis-2021-spanish-mexico"
192
  "iberbench/general-parafraseja-paraphrase_detection-2022-catalan",
@@ -258,4 +258,4 @@ professional_mapping = {
258
  "iberbench/pan-author_profiling-age_detection-2015-spanish",
259
  "iberbench/general-hate_check-hate_speech_detection-2024-portuguese"
260
  ]
261
- }
 
153
  # Professional tasks are those that have economic interest for the industry.
154
  # Tasks like author profiling, sentiment analysis, machine-generated text detection, fake news detection, stance, discrimination, etc. are under this category.
155
  professional_mapping = {
156
+ "Fundamental NLP": [
157
  "iberbench/iberlef-adobo-lexical_borrowing_chunking-2021-spanish",
158
  "iberbench/general-eus_exams-question_answering-2024-basque",
159
  "iberbench/general-eus_exams-question_answering-2024-spanish",
 
172
  "iberbench/general-belebele-reading_comprehension-2024-spanish",
173
  "iberbench/general-escola-linguistic_acceptability-2024-spanish",
174
  "iberbench/general-xnli-textual_entailment-2024-spanish",
175
+ "iberbench/general-xnli-textual_entailment-2024-galician",
176
  "iberbench/general-galcola-linguistic_acceptability-2024-galician",
177
+ "iberbench/general-xstorycloze-question_answering-2024-galician",
178
+ "iberbench/general-belebele-reading_comprehension-2024-portuguese",
179
  "iberbench/general-xstorycloze-question_answering-2024-portuguese",
180
  "iberbench/general-xstorycloze-question_answering-2024-catalan",
181
  "iberbench/general-xnli-textual_entailment-2024-catalan",
 
185
  "iberbench/general-copa-commonsense_reasoning-2024-catalan",
186
  "iberbench/general-catcola-linguistic_acceptability-2024-catalan",
187
  "iberbench/general-teca-textual_entailment-2021-catalan",
188
+ ],
189
+ "Industry NLP": [
190
  "iberbench/iberlef-restmex-sentiment_analysis-2022-spanish-mexico",
191
  "iberbench/iberlef-restmex-sentiment_analysis-2021-spanish-mexico"
192
  "iberbench/general-parafraseja-paraphrase_detection-2022-catalan",
 
258
  "iberbench/pan-author_profiling-age_detection-2015-spanish",
259
  "iberbench/general-hate_check-hate_speech_detection-2024-portuguese"
260
  ]
261
+ }