import json
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from datasets import load_dataset
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
TASK_TEXT,
SUBMIT_TEMPLATE,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_RESULTS_PATH, GOLDEN_REPO, REPO_ID, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.evaluation import evaluate
import pdb
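
# Restart the Space via the Hub API; used when initialisation fails and by the periodic scheduler below.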
def restart_space():
API.restart_space(repo_id=REPO_ID)
### Space initialisation
# try:
# print(EVAL_REQUESTS_PATH)
# snapshot_download(
# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
# )
# except Exception:
# restart_space()
# try:
# print(EVAL_RESULTS_PATH)
# snapshot_download(
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
# )
# except Exception:
# restart_space()
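
# Load the gold-standard answers used to score uploaded submissions; restart the Space if the download fails.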
try:
golden = load_dataset(GOLDEN_REPO, token=TOKEN)
print(golden)
except Exception:
restart_space()
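
# Leaderboard tabs: an overall view plus one per puzzle type; pre-compute a DataFrame for each.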
task = ['Overall', 'Crossword', 'Acrostic', 'Logic_Puzzle', 'Cryptogram', 'Sudoku', 'Drop_Quote']
leaderboard_dict = {}
for t in task:
leaderboard_dict[t] = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, task=t)
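
# Build a read-only Gradio Dataframe from a leaderboard DataFrame, bolding the best score in each numeric column.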
def init_leaderboard(dataframe):
if dataframe is None or dataframe.empty:
raise ValueError("Leaderboard DataFrame is empty or None.")
# pdb.set_trace()
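    # Bold each column's maximum (no bolding when every value in the column is identical).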
def highlight_max_bold(s):
return ['font-weight: bold' if v == s.max() and v != s.min() else '' for v in s]
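    # Display float columns with one decimal place and apply the bold-max styling.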
num_cols = dataframe.select_dtypes(include=['float']).columns
styler = dataframe.style.format({col: "{:.1f}" for col in num_cols})
styler = styler.apply(highlight_max_bold, subset=num_cols)
return gr.components.Dataframe(
value=styler,
headers=[c.name for c in fields(AutoEvalColumn)],
datatype=[c.type for c in fields(AutoEvalColumn)],
row_count=10,
interactive=False,
column_widths=[180, 60, 80, 80, 80, 80, 60],
)
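
# Score an uploaded submission JSON against the gold answers and return the per-task results as a JSON string.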
def eval_json(file):
try:
with open(file.name, 'r', encoding='utf-8') as f:
data = json.load(f)
tasks = ["crossword", "acrostic", "logic", "cryptogram", "sudoku", "drop"]
eval_dict = {}
for task in tasks:
data_list = data["results"][task]
golden_list = golden[task]
result = evaluate(data_list, golden_list, task)
eval_dict[task] = result
return json.dumps(eval_dict, indent=4)
except Exception as e:
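        # Surface the error message in the output component instead of raising.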
return str(e)
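
### Gradio UI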
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_id="main-tabs", elem_classes="tab-buttons") as tabs:
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
# leaderboard = init_leaderboard(LEADERBOARD_DF)
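            # One sub-tab per task, each with an optional description and its own leaderboard table.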
with gr.Tabs():
for i, t in enumerate(task):
with gr.TabItem(t.replace("_", " "), elem_id=f"llm-benchmark-tab-table-{t}", id=i):
if TASK_TEXT.get(t, None):
gr.Markdown(TASK_TEXT[t], elem_classes="markdown-text")
leaderboard = init_leaderboard(leaderboard_dict[t])
# with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Row():
gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
gr.Markdown("## Submission Template", elem_classes="markdown-text")
gr.Markdown("See [submission_template.json](https://github.com/Ultramarine-spec/LR2Bench/blob/main/submission_template.json) for detail. The following is an example for the JSON structure.", elem_classes="markdown-text")
gr.Markdown(SUBMIT_TEMPLATE, elem_classes="markdown-text", height=250)
file_input = gr.File(label="Upload JSON File", file_types=[".json"], height=150)
            json_output = gr.JSON(label="Your Model Performance")  # JSON view of the evaluation results
submit_button = gr.Button("Submit")
submit_button.click(fn=eval_json, inputs=file_input, outputs=json_output)
with gr.Row():
# gr.Markdown()
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
elem_id="citation-button",
show_copy_button=True,
)
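
# Restart the Space every 30 minutes via a background scheduler.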
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()