Alexandre Piche committed
Commit c119a86 · 1 Parent(s): 33a819c

update leaderboard

Files changed (5)
  1. README.md +8 -38
  2. app.py +298 -180
  3. content.py +50 -0
  4. requirements.txt +2 -13
  5. scorer.py +104 -0
README.md CHANGED
@@ -1,46 +1,16 @@
1
  ---
2
- title: Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
- short_description: Duplicate this leaderboard to initialize your own!
11
- sdk_version: 5.19.0
 
 
12
  ---
13
 
14
- # Start the configuration
15
-
16
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
17
-
18
- Results files should have the following format and be stored as json files:
19
- ```json
20
- {
21
- "config": {
22
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
23
- "model_name": "path of the model on the hub: org/model",
24
- "model_sha": "revision on the hub",
25
- },
26
- "results": {
27
- "task_name": {
28
- "metric_name": score,
29
- },
30
- "task_name2": {
31
- "metric_name": score,
32
- }
33
- }
34
- }
35
- ```
36
-
37
- Request files are created automatically by this tool.
38
-
39
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
40
-
41
- # Code logic for more complex edits
42
-
43
- You'll find
44
- - the main table' columns names and properties in `src/display/utils.py`
45
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
  ---
2
+ title: GAIA Leaderboard
3
+ emoji: 🦾
4
+ colorFrom: yellow
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
+ hf_oauth: true
11
+ failure_strategy: rollback
12
+ tags:
13
+ - leaderboard
14
  ---
15
 
16
  + Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,204 +1,322 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
 
 
30
 
31
 
32
  def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
 
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
 
 
 
 
91
 
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
 
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
 
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
 
 
 
 
189
  )
190
 
 
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
194
  value=CITATION_BUTTON_TEXT,
195
  label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
  elem_id="citation-button",
198
- show_copy_button=True,
199
- )
 
 
200
 
201
  scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ import requests
5
+ from email.utils import parseaddr
6
+
7
  import gradio as gr
 
8
  import pandas as pd
9
+ import numpy as np
10
+
11
+ from datasets import load_dataset, VerificationMode
12
  from apscheduler.schedulers.background import BackgroundScheduler
13
+ from huggingface_hub import HfApi
14
+
15
+ # InfoStrings
16
+ from scorer import question_scorer
17
+ from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
18
+
19
+ TOKEN = os.environ.get("TOKEN", None)
20
+
21
+ OWNER="financebench"
22
+ DATA_DATASET = f"{OWNER}/finance-events-latest"
23
+ INTERNAL_DATA_DATASET = f"{OWNER}/finance-events-latest"
24
+ SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
25
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
26
+ CONTACT_DATASET = f"{OWNER}/contact_info"
27
+ RESULTS_DATASET = f"{OWNER}/results"
28
+ LEADERBOARD_PATH = f"{OWNER}/leaderboard"
29
+ api = HfApi()
30
+
31
+ YEAR_VERSION = ""
32
+ ref_scores_len = {"valid": 165, "test": 301}
33
+ ref_level_len = {"valid": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
34
+
35
+ os.makedirs("scored", exist_ok=True)
36
+
37
+ # Should be False on spaces and True outside
38
+ LOCAL_DEBUG = False #not (os.environ.get("system") == "spaces")
39
+
40
+ # Display the results
41
+ eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
42
+ contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
43
+ def get_dataframe_from_results(eval_results, split):
44
+ local_df = eval_results[split]
45
+ local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
46
+ local_df = local_df.remove_columns(["system_prompt", "url"])
47
+ local_df = local_df.rename_column("model", "Agent name")
48
+ local_df = local_df.rename_column("model_family", "Model family")
49
+ local_df = local_df.rename_column("score", "Return (%)")
50
+ local_df = local_df.rename_column("date", "Submission date")
51
+ df = pd.DataFrame(local_df)
52
+ df = df.sort_values(by=["Return (%)"], ascending=False)
53
+
54
+ numeric_cols = [c for c in local_df.column_names if "score" in c]
55
+ df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
56
+ #df = df.style.format("{:.2%}", subset=numeric_cols)
57
+
58
+ return df
59
+
60
+ eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
61
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
62
+
63
+ # Gold answers
64
+ gold_results = {}
65
+ gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "", token=TOKEN, trust_remote_code=True)
66
+ gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "valid"]}
67
 
68
 
69
  def restart_space():
70
+ api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
71
 
72
+ TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
 
 
 
 
73
 
74
+ def add_new_eval(
75
+ val_or_test: str,
76
+ model: str,
77
+ model_family: str,
78
+ system_prompt: str,
79
+ url: str,
80
+ path_to_file: str,
81
+ organisation: str,
82
+ mail: str,
83
+ profile: gr.OAuthProfile,
84
+ ):
85
+ # Was the profile created less than 2 months ago?
86
+ user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
87
+ creation_date = json.loads(user_data.content)["createdAt"]
88
+ if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
89
+ return format_error("This account is not authorized to submit on FinanceBench.")
90
+
91
 
92
+ contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
93
+ user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
94
+ if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
95
+ return format_error("You already submitted once today, please try again tomorrow.")
96
 
 
 
 
97
 
98
+ is_valid = val_or_test == "valid"
99
+ # Very basic email parsing
100
+ _, parsed_mail = parseaddr(mail)
101
+ if "@" not in parsed_mail:
102
+ return format_warning("Please provide a valid email address.")
103
 
104
+ print("Adding new eval")
 
 
 
 
105
 
106
+ # Check if the model/organisation combination already exists and return a warning if so
107
+ if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
108
+ return format_warning("This model has been already submitted.")
109
+
110
+ if path_to_file is None:
111
+ return format_warning("Please attach a file.")
112
+
113
+ # SAVE UNSCORED SUBMISSION
114
+ if LOCAL_DEBUG:
115
+ print("mock uploaded submission")
116
+ else:
117
+ api.upload_file(
118
+ repo_id=SUBMISSION_DATASET,
119
+ path_or_fileobj=path_to_file.name,
120
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
121
+ repo_type="dataset",
122
+ token=TOKEN
123
+ )
124
+
125
+ # SAVE CONTACT
126
+ contact_info = {
127
+ "model": model,
128
+ "model_family": model_family,
129
+ "url": url,
130
+ "organisation": organisation,
131
+ "username": profile.username,
132
+ "mail": mail,
133
+ "date": datetime.datetime.today().strftime('%Y-%m-%d')
134
+ }
135
+ contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
136
+ if LOCAL_DEBUG:
137
+ print("mock uploaded contact info")
138
+ else:
139
+ contact_infos.push_to_hub(CONTACT_DATASET, config_name = YEAR_VERSION, token=TOKEN)
140
+
141
+ # SCORE SUBMISSION
142
+ file_path = path_to_file.name
143
+ scores = {"all": 0, 1: 0, 2: 0, 3: 0}
144
+ num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
145
+ task_ids = []
146
+ with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
147
+ with open(file_path, 'r') as f:
148
+ for ix, line in enumerate(f):
149
+ try:
150
+ task = json.loads(line)
151
+ except Exception:
152
+ return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
153
+
154
+ if "model_answer" not in task:
155
+ return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
156
+ answer = task["model_answer"]
157
+ task_id = task["task_id"]
158
+ try:
159
+ level = int(gold_results[val_or_test][task_id]["Level"])
160
+ except KeyError:
161
+ return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")
162
+
163
+ score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])
164
+
165
+ scored_file.write(
166
+ json.dumps({
167
+ "id": task_id,
168
+ "model_answer": answer,
169
+ "score": score,
170
+ "level": level
171
+ }) + "\n"
172
+ )
173
+ task_ids.append(task_id)
174
+
175
+ scores["all"] += score
176
+ scores[level] += score
177
+ num_questions["all"] += 1
178
+ num_questions[level] += 1
179
+
180
+ # Check if there's any duplicate in the submission
181
+ if len(task_ids) != len(set(task_ids)):
182
+ return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
183
+
184
+ if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
185
+ return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
186
+
187
+ # SAVE SCORED SUBMISSION
188
+ if LOCAL_DEBUG:
189
+ print("mock uploaded scored submission")
190
+ else:
191
+ api.upload_file(
192
+ repo_id=SUBMISSION_DATASET,
193
+ path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
194
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
195
+ repo_type="dataset",
196
+ token=TOKEN
197
+ )
198
+
199
+ # Save scored file
200
+ if is_valid:
201
+ api.upload_file(
202
+ repo_id=SUBMISSION_DATASET_PUBLIC,
203
+ path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
204
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
205
+ repo_type="dataset",
206
+ token=TOKEN
207
  )
208
 
209
+ # SAVE TO LEADERBOARD DATA
210
+ eval_entry = {
211
+ "model": model,
212
+ "model_family": model_family,
213
+ "system_prompt": system_prompt,
214
+ "url": url,
215
+ "organisation": organisation,
216
+ "score": scores["all"]/ref_scores_len[val_or_test],
217
+ "date": datetime.datetime.today().strftime('%Y-%m-%d')
218
+ }
219
+ if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
220
+ return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
221
+ # Catching spam submissions of 100%
222
+
223
+ # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
224
+ #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
225
+ #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
226
+ #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
227
+ # return format_error(f"Your submission is an exact duplicate from an existing submission.")
228
+
229
+ eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
230
+ print(eval_results)
231
+ if LOCAL_DEBUG:
232
+ print("mock uploaded results to lb")
233
+ else:
234
+ eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
235
+
236
+
237
+ return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
238
+
239
+
240
+ def refresh():
241
+ eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS,trust_remote_code=True)
242
+ eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
243
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
244
+ return eval_dataframe_val, eval_dataframe_test
245
+
246
+ def upload_file(files):
247
+ file_paths = [file.name for file in files]
248
+ return file_paths
249
+
250
+
251
+ demo = gr.Blocks()
252
+ with demo:
253
+ gr.HTML(TITLE)
254
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
255
+
256
  with gr.Row():
257
  with gr.Accordion("📙 Citation", open=False):
258
  citation_button = gr.Textbox(
259
  value=CITATION_BUTTON_TEXT,
260
  label=CITATION_BUTTON_LABEL,
 
261
  elem_id="citation-button",
262
+ ) #.style(show_copy_button=True)
263
+
264
+ with gr.Tab("Results: Test"):
265
+ leaderboard_table_test = gr.components.Dataframe(
266
+ value=eval_dataframe_test, datatype=TYPES, interactive=False,
267
+ column_widths=["20%"]
268
+ )
269
+ with gr.Tab("Results: Valid"):
270
+ leaderboard_table_val = gr.components.Dataframe(
271
+ value=eval_dataframe_val, datatype=TYPES, interactive=False,
272
+ column_widths=["20%"]
273
+ )
274
+
275
+ refresh_button = gr.Button("Refresh")
276
+ refresh_button.click(
277
+ refresh,
278
+ inputs=[],
279
+ outputs=[
280
+ leaderboard_table_val,
281
+ leaderboard_table_test,
282
+ ],
283
+ )
284
+ with gr.Accordion("Submit a new model for evaluation"):
285
+ with gr.Row():
286
+ gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
287
+ with gr.Row():
288
+ with gr.Column():
289
+ level_of_test = gr.Radio(["valid", "test"], value="valid", label="Split")
290
+ model_name_textbox = gr.Textbox(label="Agent name")
291
+ model_family_textbox = gr.Textbox(label="Model family")
292
+ system_prompt_textbox = gr.Textbox(label="System prompt example")
293
+ url_textbox = gr.Textbox(label="URL to model information")
294
+ with gr.Column():
295
+ organisation = gr.Textbox(label="Organisation")
296
+ mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
297
+ file_output = gr.File()
298
+
299
+
300
+ with gr.Row():
301
+ gr.LoginButton()
302
+ submit_button = gr.Button("Submit Eval")
303
+ submission_result = gr.Markdown()
304
+ submit_button.click(
305
+ add_new_eval,
306
+ [
307
+ level_of_test,
308
+ model_name_textbox,
309
+ model_family_textbox,
310
+ system_prompt_textbox,
311
+ url_textbox,
312
+ file_output,
313
+ organisation,
314
+ mail
315
+ ],
316
+ submission_result,
317
+ )
318
 
319
  scheduler = BackgroundScheduler()
320
+ scheduler.add_job(restart_space, "interval", seconds=3600)
321
  scheduler.start()
322
+ demo.launch(debug=True)
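Note: `add_new_eval` above expects the uploaded file to be JSON Lines, one object per line with at least `task_id` and `model_answer`; each line is scored with `question_scorer` and tallied per level. A minimal offline sketch of that loop (the gold data, file name, and stub scorer are illustrative, not part of this repo):

```python
import json

def stub_scorer(model_answer: str, gold_answer: str) -> bool:
    # Stand-in for scorer.question_scorer (defined in scorer.py below).
    return model_answer.strip().lower() == gold_answer.strip().lower()

# Hypothetical gold split keyed by task_id, mirroring gold_results[split] in app.py.
gold = {"task_1": {"Final answer": "42", "Level": 1}}

scores = {"all": 0, 1: 0, 2: 0, 3: 0}
num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}

with open("my_submission.jsonl") as f:  # hypothetical submission file
    for line in f:
        task = json.loads(line)
        gold_row = gold[task["task_id"]]
        level = int(gold_row["Level"])
        score = stub_scorer(task["model_answer"], gold_row["Final answer"])
        scores["all"] += score
        scores[level] += score
        num_questions["all"] += 1
        num_questions[level] += 1

# app.py stores scores["all"] divided by the reference split size as the leaderboard score.
print(scores["all"] / max(num_questions["all"], 1))
```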
content.py ADDED
@@ -0,0 +1,50 @@
 
1
+ TITLE = """<h1 align="center" id="space-title">FinanceBench Leaderboard</h1>"""
2
+
3
+ INTRODUCTION_TEXT = """
4
+ FinanceBench is a benchmark that aims to evaluate next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc.). (See our [paper](https://arxiv.org/abs/2311.12983) for more details.)
5
+
6
+ ## Leaderboard
7
+ Submissions made by our team are labelled "FinanceBench authors". While we report average scores over different runs when possible in our paper, we only report the best run on the leaderboard.
8
+
9
+ See below for submissions.
10
+ """
11
+
12
+ SUBMISSION_TEXT = """
13
+ ## Submissions
14
+ Results can be submitted for both validation and test. Scores are expressed as the percentage of correct answers for a given split.
15
+
16
+ Each question calls for an answer that is either a string (one or a few words), a number, or a comma separated list of strings or floats, unless specified otherwise. There is only one correct answer.
17
+ Hence, evaluation is done via quasi exact match between a model’s answer and the ground truth (up to some normalization that is tied to the “type” of the ground truth).
18
+
19
+ In our evaluation, we use a system prompt to instruct the model about the required format:
20
+ ```
21
+ You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
22
+ ```
23
+ We advise you to use the system prompt provided in the paper to ensure your agents answer using the correct and expected format. In practice, GPT-4-level models follow it easily.
24
+
25
+
26
+ We expect submissions to be JSON Lines files with the following format. The first two fields are mandatory; `reasoning_trace` is optional:
27
+ ```
28
+ {"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
29
+ {"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
30
+ ```
31
+
32
+ """
33
+
34
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
35
+ CITATION_BUTTON_TEXT = r"""@misc{
36
+ }"""
37
+
38
+
39
+ def format_error(msg):
40
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
41
+
42
+ def format_warning(msg):
43
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
44
+
45
+ def format_log(msg):
46
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
47
+
48
+ def model_hyperlink(link, model_name):
49
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
50
+
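The system prompt quoted in `SUBMISSION_TEXT` asks models to finish with a `FINAL ANSWER:` line, and submissions are JSON Lines with `task_id`, `model_answer`, and an optional `reasoning_trace`. A small sketch of turning a raw model response into one submission line (the helper name and sample response are hypothetical, not part of the benchmark code):

```python
import json
import re

def extract_final_answer(response: str) -> str:
    # Take whatever follows the last "FINAL ANSWER:" marker, per the prompt template.
    matches = re.findall(r"FINAL ANSWER:\s*(.+)", response)
    return matches[-1].strip() if matches else response.strip()

response = "Step 1: ... Step 2: ... FINAL ANSWER: 42"  # illustrative model output
line = {
    "task_id": "task_id_1",                       # must match a task_id of the chosen split
    "model_answer": extract_final_answer(response),
    "reasoning_trace": "optional free-form trace",
}
print(json.dumps(line))  # write one such line per task to a .jsonl file
```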
requirements.txt CHANGED
@@ -1,16 +1,5 @@
1
- APScheduler
2
- black
3
  datasets
4
  gradio
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
  numpy
11
- pandas
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
1
  datasets
2
  gradio
3
+ huggingface-hub
4
  numpy
5
+ APScheduler
scorer.py ADDED
@@ -0,0 +1,104 @@
 
1
+ import json
2
+ import re
3
+ import string
4
+ import warnings
5
+
6
+ import numpy as np
7
+
8
+
9
+ def normalize_number_str(number_str: str) -> float:
10
+ # we replace these common units and commas to allow
11
+ # conversion to float
12
+ for char in ["$", "%", ","]:
13
+ number_str = number_str.replace(char, "")
14
+ try:
15
+ return float(number_str)
16
+ except ValueError:
17
+ print(f"String {number_str} cannot be normalized to a number.")
18
+ return float("inf")
19
+
20
+
21
+ def split_string(
22
+ s: str,
23
+ char_list: list[str] = [",", ";"],
24
+ ) -> list[str]:
25
+ pattern = f"[{''.join(char_list)}]"
26
+ return re.split(pattern, s)
27
+
28
+
29
+ def question_scorer(
30
+ model_answer: str,
31
+ ground_truth: str,
32
+ ) -> bool:
33
+ def is_float(element: any) -> bool:
34
+ try:
35
+ float(element)
36
+ return True
37
+ except ValueError:
38
+ return False
39
+
40
+ if model_answer is None:
41
+ model_answer = "None"
42
+
43
+ # if gt is a number
44
+ if is_float(ground_truth):
45
+ print(f"Evaluating {model_answer} as a number.")
46
+ normalized_answer = normalize_number_str(model_answer)
47
+ return normalized_answer == float(ground_truth)
48
+
49
+ # if gt is a list
50
+ elif any(char in ground_truth for char in [",", ";"]):
51
+ print(f"Evaluating {model_answer} as a comma separated list.")
52
+ # question with the fish: normalization removes punct
53
+
54
+ gt_elems = split_string(ground_truth)
55
+ ma_elems = split_string(model_answer)
56
+
57
+ # check length is the same
58
+ if len(gt_elems) != len(ma_elems):
59
+ warnings.warn(
60
+ "Answer lists have different lengths, returning False.", UserWarning
61
+ )
62
+ return False
63
+
64
+ # compare each element as float or str
65
+ comparisons = []
66
+ for ma_elem, gt_elem in zip(ma_elems, gt_elems):
67
+ if is_float(gt_elem):
68
+ normalized_ma_elem = normalize_number_str(ma_elem)
69
+ comparisons.append(normalized_ma_elem == float(gt_elem))
70
+ else:
71
+ # we do not remove punct since comparisons can include punct
72
+ comparisons.append(
73
+ normalize_str(ma_elem, remove_punct=False)
74
+ == normalize_str(gt_elem, remove_punct=False)
75
+ )
76
+ return all(comparisons)
77
+
78
+ # if gt is a str
79
+ else:
80
+ print(f"Evaluating {model_answer} as a string.")
81
+ return normalize_str(model_answer) == normalize_str(ground_truth)
82
+
83
+
84
+ def normalize_str(input_str, remove_punct=True) -> str:
85
+ """
86
+ Normalize a string by:
87
+ - Removing all white spaces
88
+ - Optionally removing punctuation (if remove_punct is True)
89
+ - Converting to lowercase
90
+ Parameters:
91
+ - input_str: str, the string to normalize
92
+ - remove_punct: bool, whether to remove punctuation (default: True)
93
+ Returns:
94
+ - str, the normalized string
95
+ """
96
+ # Remove all white spaces. Required e.g for seagull vs. sea gull
97
+ no_spaces = re.sub(r"\s", "", input_str)
98
+
99
+ # Remove punctuation, if specified.
100
+ if remove_punct:
101
+ translator = str.maketrans("", "", string.punctuation)
102
+ return no_spaces.lower().translate(translator)
103
+ else:
104
+ return no_spaces.lower()
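For illustration, a few calls to `question_scorer` and the outcomes implied by the normalization above (numbers are compared after stripping "$", "%", and ","; lists are split on "," or ";" and compared element-wise; strings are lowercased with whitespace and, by default, punctuation removed). The import assumes this file is available as `scorer.py`:

```python
from scorer import question_scorer

# Number: "$", "%" and "," are stripped before the float comparison.
print(question_scorer("$1,234.00", "1234"))              # True

# List: split on "," / ";", then each element is compared after normalization.
print(question_scorer("apple, Banana", "apple,banana"))  # True
print(question_scorer("apple", "apple,banana"))          # False (length mismatch)

# String: whitespace removed, lowercased, punctuation stripped.
print(question_scorer("Sea gull", "seagull"))            # True
```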