Commit 84010af · Parent(s): 1bcaf5a · "add"

Files changed:
- app.py (+8, -86)
- src/about.py (+39, -0)
app.py (CHANGED)

@@ -11,6 +11,7 @@ from src.about import (
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     TASK_TEXT,
+    SUBMIT_TEMPLATE,
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )

@@ -138,96 +139,17 @@ with demo:
             # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")

-
-
-
-
+            gr.Markdown("## Submission Template", elem_classes="markdown-text")
+            gr.Markdown(SUBMIT_TEMPLATE, elem_classes="markdown-text", height=250)
+
+            file_input = gr.File(label="Upload JSON File", file_types=[".json"], height=150)
+            json_output = gr.JSON(label="Parsed JSON Data")  # display the parsed JSON data
+            submit_button = gr.Button("Submit")
+            submit_button.click(fn=process_json, inputs=file_input, outputs=json_output)

-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )

     with gr.Row():
         # gr.Markdown()
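The rewritten Submit tab wires `submit_button.click` to a `process_json` callback that is not part of this commit. A minimal sketch of such a handler is shown below, assuming Gradio's default `gr.File` behaviour of passing the uploaded file's path; the function body is illustrative, not the repository's actual implementation.

```python
import json


def process_json(file_path):
    """Hypothetical handler: load the uploaded submission file and
    return it as a dict for the gr.JSON output component."""
    if file_path is None:
        return {"error": "Please upload a JSON file."}
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as exc:
        return {"error": f"Invalid JSON: {exc}"}
```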
src/about.py (CHANGED)

@@ -54,6 +54,45 @@ TASK_TEXT = {
     'Drop_Quote': 'The Drop Quote task comprises a grid of multiple rows and columns, with each column providing a set of candidate letters. The task requires determining the correct row for letters in each column, effectively "dropping" it into target place to reveal the hidden quotation. We created 50 easy samples by manually compiling common quotations, and collected 50 hard samples from <a href="https://www.printable-puzzles.com/printable-drop-quotes.php" target="_blank"> Printable Puzzles</a>, with timestamps ranging from September 2024 to December 2024.'
 }

+SUBMIT_TEMPLATE = """
+```python
+{
+    "config": {
+        "model_name": "deepseek-ai/DeepSeek-R1", # your model name
+        "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1", # your model link if available
+        "Params": 671, # number of parameters if available
+        "show_on_leaderboard": true, # whether to show your model on the leaderboard
+    },
+    "results": {
+        "Acrostic": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        },
+        "Crossword": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        },
+        "Cryptogram": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        },
+        "Logic_Puzzle": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        },
+        "Sudoku": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        },
+        "Drop_Quote": {
+            "TAG1": "RESPONSE1",
+            "TAG2": "RESPONSE2",
+        }
+    }
+}
+```
+"""
+
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
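Note that SUBMIT_TEMPLATE is rendered as a Python-highlighted block with inline comments and trailing commas, whereas the file uploaded through the new Submit tab has to be strict JSON (no comments, no trailing commas, lowercase `true`/`false`). Below is a sketch of how a submitter might generate a conforming file; the tag/response values are placeholders, and the exact tag identifiers expected by the leaderboard are not defined in this commit.

```python
import json

# Placeholder submission following SUBMIT_TEMPLATE; real tags and
# responses come from the benchmark's task data.
submission = {
    "config": {
        "model_name": "deepseek-ai/DeepSeek-R1",
        "link": "https://huggingface.co/deepseek-ai/DeepSeek-R1",
        "Params": 671,
        "show_on_leaderboard": True,
    },
    "results": {
        task: {"TAG1": "RESPONSE1", "TAG2": "RESPONSE2"}
        for task in ["Acrostic", "Crossword", "Cryptogram",
                     "Logic_Puzzle", "Sudoku", "Drop_Quote"]
    },
}

# json.dump emits strict JSON: True becomes true and no trailing commas remain.
with open("submission.json", "w", encoding="utf-8") as f:
    json.dump(submission, f, indent=2, ensure_ascii=False)
```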