Commit c119a86 · Alexandre Piche committed
Parent(s): 33a819c

update leaderboard

Files changed:
- README.md (+8, -38)
- app.py (+298, -180)
- content.py (+50, -0)
- requirements.txt (+2, -13)
- scorer.py (+104, -0)
README.md
CHANGED
@@ -1,46 +1,16 @@
 ---
-title: Leaderboard
-emoji:
-colorFrom:
+title: GAIA Leaderboard
+emoji: 🦾
+colorFrom: yellow
 colorTo: indigo
 sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
+hf_oauth: true
+failure_strategy: rollback
+tags:
+- leaderboard
 ---

-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter a problem on the space, don't hesitate to restart it to remove the eval-queue, eval-queue-bk, eval-results and eval-results-bk folders it creates.
-
-# Code logic for more complex edits
-
-You'll find:
-- the main table's column names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,204 +1,322 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import


 def restart_space():

-    try:
-        print(EVAL_REQUESTS_PATH)
-        snapshot_download(
-            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        restart_space()
-    try:
-        print(EVAL_RESULTS_PATH)
-        snapshot_download(
-            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-        )
-    except Exception:
-        restart_space()
-
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )

-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)

-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

         )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
-                lines=20,
                 elem_id="citation-button",

 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=
 scheduler.start()
-demo.

+import os
+import json
+import datetime
+import requests
+from email.utils import parseaddr
+
 import gradio as gr
 import pandas as pd
+import numpy as np
+
+from datasets import load_dataset, VerificationMode
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+
+# InfoStrings
+from scorer import question_scorer
+from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
+
+TOKEN = os.environ.get("TOKEN", None)
+
+OWNER = "financebench"
+DATA_DATASET = f"{OWNER}/finance-events-latest"
+INTERNAL_DATA_DATASET = f"{OWNER}/finance-events-latest"
+SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
+CONTACT_DATASET = f"{OWNER}/contact_info"
+RESULTS_DATASET = f"{OWNER}/results"
+LEADERBOARD_PATH = f"{OWNER}/leaderboard"
+api = HfApi()
+
+YEAR_VERSION = ""
+ref_scores_len = {"valid": 165, "test": 301}
+ref_level_len = {"valid": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
+
+os.makedirs("scored", exist_ok=True)
+
+# Should be False on spaces and True outside
+LOCAL_DEBUG = False  # not (os.environ.get("system") == "spaces")
+
+# Display the results
+eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+
+def get_dataframe_from_results(eval_results, split):
+    local_df = eval_results[split]
+    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
+    local_df = local_df.remove_columns(["system_prompt", "url"])
+    local_df = local_df.rename_column("model", "Agent name")
+    local_df = local_df.rename_column("model_family", "Model family")
+    local_df = local_df.rename_column("score", "Return (%)")
+    local_df = local_df.rename_column("date", "Submission date")
+    df = pd.DataFrame(local_df)
+    df = df.sort_values(by=["Return (%)"], ascending=False)
+
+    numeric_cols = [c for c in local_df.column_names if "score" in c]
+    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
+    # df = df.style.format("{:.2%}", subset=numeric_cols)
+
+    return df
+
+eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
+eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+
+# Gold answers
+gold_results = {}
+gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "", token=TOKEN, trust_remote_code=True)
+gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "valid"]}


 def restart_space():
+    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

+TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]

+def add_new_eval(
+    val_or_test: str,
+    model: str,
+    model_family: str,
+    system_prompt: str,
+    url: str,
+    path_to_file: str,
+    organisation: str,
+    mail: str,
+    profile: gr.OAuthProfile,
+):
+    # Was the profile created less than 2 months ago?
+    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
+    creation_date = json.loads(user_data.content)["createdAt"]
+    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
+        return format_error("This account is not authorized to submit on FinanceBench.")

+    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+    user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
+    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
+        return format_error("You already submitted once today, please try again tomorrow.")

+    is_valid = val_or_test == "valid"
+    # Very basic email parsing
+    _, parsed_mail = parseaddr(mail)
+    if not "@" in parsed_mail:
+        return format_warning("Please provide a valid email address.")

+    print("Adding new eval")

+    # Check if the combination model/org already exists and prints a warning message if yes
+    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
+        return format_warning("This model has been already submitted.")
+
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+
+    # SAVE UNSCORED SUBMISSION
+    if LOCAL_DEBUG:
+        print("mock uploaded submission")
+    else:
+        api.upload_file(
+            repo_id=SUBMISSION_DATASET,
+            path_or_fileobj=path_to_file.name,
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+            repo_type="dataset",
+            token=TOKEN
+        )
+
+    # SAVE CONTACT
+    contact_info = {
+        "model": model,
+        "model_family": model_family,
+        "url": url,
+        "organisation": organisation,
+        "username": profile.username,
+        "mail": mail,
+        "date": datetime.datetime.today().strftime('%Y-%m-%d')
+    }
+    contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
+    if LOCAL_DEBUG:
+        print("mock uploaded contact info")
+    else:
+        contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)
+
+    # SCORE SUBMISSION
+    file_path = path_to_file.name
+    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
+    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
+    task_ids = []
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
+        with open(file_path, 'r') as f:
+            for ix, line in enumerate(f):
+                try:
+                    task = json.loads(line)
+                except Exception:
+                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+
+                if "model_answer" not in task:
+                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
+                answer = task["model_answer"]
+                task_id = task["task_id"]
+                try:
+                    level = int(gold_results[val_or_test][task_id]["Level"])
+                except KeyError:
+                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
+
+                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
+
+                scored_file.write(
+                    json.dumps({
+                        "id": task_id,
+                        "model_answer": answer,
+                        "score": score,
+                        "level": level
+                    }) + "\n"
+                )
+                task_ids.append(task_id)
+
+                scores["all"] += score
+                scores[level] += score
+                num_questions["all"] += 1
+                num_questions[level] += 1
+
+    # Check if there's any duplicate in the submission
+    if len(task_ids) != len(set(task_ids)):
+        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
+
+    if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
+        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
+
+    # SAVE SCORED SUBMISSION
+    if LOCAL_DEBUG:
+        print("mock uploaded scored submission")
+    else:
+        api.upload_file(
+            repo_id=SUBMISSION_DATASET,
+            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+            repo_type="dataset",
+            token=TOKEN
+        )
+
+    # Save scored file
+    if is_valid:
+        api.upload_file(
+            repo_id=SUBMISSION_DATASET_PUBLIC,
+            path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+            repo_type="dataset",
+            token=TOKEN
         )

+    # SAVE TO LEADERBOARD DATA
+    eval_entry = {
+        "model": model,
+        "model_family": model_family,
+        "system_prompt": system_prompt,
+        "url": url,
+        "organisation": organisation,
+        "score": scores["all"]/ref_scores_len[val_or_test],
+        "date": datetime.datetime.today().strftime('%Y-%m-%d')
+    }
+    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
+        return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
+    # Catching spam submissions of 100%
+
+    # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
+    #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
+    #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
+    #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
+    #    return format_error(f"Your submission is an exact duplicate from an existing submission.")
+
+    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    print(eval_results)
+    if LOCAL_DEBUG:
+        print("mock uploaded results to lb")
+    else:
+        eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
+
+
+    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+
+
+def refresh():
+    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
+    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
+    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+    return eval_dataframe_val, eval_dataframe_test
+
+def upload_file(files):
+    file_paths = [file.name for file in files]
+    return file_paths
+
+
+demo = gr.Blocks()
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
+            ) #.style(show_copy_button=True)
+
+    with gr.Tab("Results: Test"):
+        leaderboard_table_test = gr.components.Dataframe(
+            value=eval_dataframe_test, datatype=TYPES, interactive=False,
+            column_widths=["20%"]
+        )
+    with gr.Tab("Results: valid"):
+        leaderboard_table_val = gr.components.Dataframe(
+            value=eval_dataframe_val, datatype=TYPES, interactive=False,
+            column_widths=["20%"]
+        )
+
+    refresh_button = gr.Button("Refresh")
+    refresh_button.click(
+        refresh,
+        inputs=[],
+        outputs=[
+            leaderboard_table_val,
+            leaderboard_table_test,
+        ],
+    )
+    with gr.Accordion("Submit a new model for evaluation"):
+        with gr.Row():
+            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+        with gr.Row():
+            with gr.Column():
+                level_of_test = gr.Radio(["valid", "test"], value="valid", label="Split")
+                model_name_textbox = gr.Textbox(label="Agent name")
+                model_family_textbox = gr.Textbox(label="Model family")
+                system_prompt_textbox = gr.Textbox(label="System prompt example")
+                url_textbox = gr.Textbox(label="Url to model information")
+            with gr.Column():
+                organisation = gr.Textbox(label="Organisation")
+                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
+                file_output = gr.File()
+
+
+        with gr.Row():
+            gr.LoginButton()
+            submit_button = gr.Button("Submit Eval")
+        submission_result = gr.Markdown()
+        submit_button.click(
+            add_new_eval,
+            [
+                level_of_test,
+                model_name_textbox,
+                model_family_textbox,
+                system_prompt_textbox,
+                url_textbox,
+                file_output,
+                organisation,
+                mail
+            ],
+            submission_result,
+        )

 scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
+demo.launch(debug=True)
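To make the new submission flow concrete: `add_new_eval` reads a JSON-lines file with one `task_id`/`model_answer` pair per line and aggregates `question_scorer` results per level. Below is a minimal local sketch of that scoring loop, assuming the `scorer.py` added in this commit is importable; the task ids, levels, and gold answers are invented for illustration.

```python
# Hedged sketch, not part of the commit: mimic the scoring loop of add_new_eval
# on a tiny in-memory "submission". Task ids, levels, and gold answers are made up.
from scorer import question_scorer

gold = {  # placeholder gold entries, shaped like rows of the internal dataset
    "task_a": {"Final answer": "42", "Level": 1},
    "task_b": {"Final answer": "paris, london", "Level": 2},
}

submission = [  # one dict per JSON line of the uploaded .jsonl file
    {"task_id": "task_a", "model_answer": "42", "reasoning_trace": "..."},
    {"task_id": "task_b", "model_answer": "Paris, London", "reasoning_trace": "..."},
]

scores = {"all": 0, 1: 0, 2: 0, 3: 0}
num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
for task in submission:
    level = int(gold[task["task_id"]]["Level"])
    score = question_scorer(task["model_answer"], gold[task["task_id"]]["Final answer"])
    scores["all"] += score
    scores[level] += score
    num_questions["all"] += 1
    num_questions[level] += 1

# The leaderboard "score" field is the overall fraction of correct answers.
print(scores["all"] / num_questions["all"])  # 1.0 for this mock submission
```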
content.py
ADDED
@@ -0,0 +1,50 @@
+TITLE = """<h1 align="center" id="space-title">FinanceBench Leaderboard</h1>"""
+
+INTRODUCTION_TEXT = """
+FinanceBench is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc). (See our [paper](https://arxiv.org/abs/2311.12983) for more details.)
+
+## Leaderboard
+Submissions made by our team are labelled "FinanceBench authors". While we report average scores over different runs when possible in our paper, we only report the best run in the leaderboard.
+
+See below for submissions.
+"""
+
+SUBMISSION_TEXT = """
+## Submissions
+Results can be submitted for both validation and test. Scores are expressed as the percentage of correct answers for a given split.
+
+Each question calls for an answer that is either a string (one or a few words), a number, or a comma separated list of strings or floats, unless specified otherwise. There is only one correct answer.
+Hence, evaluation is done via quasi exact match between a model’s answer and the ground truth (up to some normalization that is tied to the “type” of the ground truth).
+
+In our evaluation, we use a system prompt to instruct the model about the required format:
+```
+You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+```
+We advise you to use the system prompt provided in the paper to ensure your agents answer using the correct and expected format. In practice, GPT4 level models easily follow it.
+
+
+We expect submissions to be json-line files with the following format. The first two fields are mandatory, `reasoning_trace` is optional:
+```
+{"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
+{"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
+```
+
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{
+}"""
+
+
+def format_error(msg):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_warning(msg):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_log(msg):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
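As a quick illustration of the json-lines format described in `SUBMISSION_TEXT` above, a submission file could be produced as in the hedged sketch below; the task ids, answers, and file name are placeholders.

```python
# Hedged sketch, not part of the commit: write a submission file in the
# json-lines format expected by the leaderboard. All values are placeholders.
import json

predictions = [
    {"task_id": "task_id_1", "model_answer": "42", "reasoning_trace": "Steps taken to reach answer 1"},
    {"task_id": "task_id_2", "model_answer": "paris", "reasoning_trace": "Steps taken to reach answer 2"},
]

with open("submission_valid.jsonl", "w") as f:
    for row in predictions:
        f.write(json.dumps(row) + "\n")  # one JSON object per line
```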
requirements.txt
CHANGED
@@ -1,16 +1,5 @@
-APScheduler
-black
 datasets
 gradio
-
-gradio_leaderboard==0.0.13
-gradio_client
-huggingface-hub>=0.18.0
-matplotlib
+huggingface-hub
 numpy
-
-python-dateutil
-tqdm
-transformers
-tokenizers>=0.15.0
-sentencepiece
+APScheduler
scorer.py
ADDED
@@ -0,0 +1,104 @@
+import json
+import re
+import string
+import warnings
+
+import numpy as np
+
+
+def normalize_number_str(number_str: str) -> float:
+    # we replace these common units and commas to allow
+    # conversion to float
+    for char in ["$", "%", ","]:
+        number_str = number_str.replace(char, "")
+    try:
+        return float(number_str)
+    except ValueError:
+        print(f"String {number_str} cannot be normalized to number str.")
+        return float("inf")
+
+
+def split_string(
+    s: str,
+    char_list: list[str] = [",", ";"],
+) -> list[str]:
+    pattern = f"[{''.join(char_list)}]"
+    return re.split(pattern, s)
+
+
+def question_scorer(
+    model_answer: str,
+    ground_truth: str,
+) -> bool:
+    def is_float(element: any) -> bool:
+        try:
+            float(element)
+            return True
+        except ValueError:
+            return False
+
+    if model_answer is None:
+        model_answer = "None"
+
+    # if gt is a number
+    if is_float(ground_truth):
+        print(f"Evaluating {model_answer} as a number.")
+        normalized_answer = normalize_number_str(model_answer)
+        return normalized_answer == float(ground_truth)
+
+    # if gt is a list
+    elif any(char in ground_truth for char in [",", ";"]):
+        print(f"Evaluating {model_answer} as a comma separated list.")
+        # question with the fish: normalization removes punct
+
+        gt_elems = split_string(ground_truth)
+        ma_elems = split_string(model_answer)
+
+        # check length is the same
+        if len(gt_elems) != len(ma_elems):
+            warnings.warn(
+                "Answer lists have different lengths, returning False.", UserWarning
+            )
+            return False
+
+        # compare each element as float or str
+        comparisons = []
+        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
+            if is_float(gt_elem):
+                normalized_ma_elem = normalize_number_str(ma_elem)
+                comparisons.append(normalized_ma_elem == float(gt_elem))
+            else:
+                # we do not remove punct since comparisons can include punct
+                comparisons.append(
+                    normalize_str(ma_elem, remove_punct=False)
+                    == normalize_str(gt_elem, remove_punct=False)
+                )
+        return all(comparisons)
+
+    # if gt is a str
+    else:
+        print(f"Evaluating {model_answer} as a string.")
+        return normalize_str(model_answer) == normalize_str(ground_truth)
+
+
+def normalize_str(input_str, remove_punct=True) -> str:
+    """
+    Normalize a string by:
+    - Removing all white spaces
+    - Optionally removing punctuation (if remove_punct is True)
+    - Converting to lowercase
+    Parameters:
+    - input_str: str, the string to normalize
+    - remove_punct: bool, whether to remove punctuation (default: True)
+    Returns:
+    - str, the normalized string
+    """
+    # Remove all white spaces. Required e.g for seagull vs. sea gull
+    no_spaces = re.sub(r"\s", "", input_str)
+
+    # Remove punctuation, if specified.
+    if remove_punct:
+        translator = str.maketrans("", "", string.punctuation)
+        return no_spaces.lower().translate(translator)
+    else:
+        return no_spaces.lower()