Update app.py
Browse files
app.py
CHANGED
@@ -2,115 +2,155 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
-
#
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
92.5, 94.2, 83.4, 87.2, 81.5,
|
25 |
-
76.8, 79.3, 75.2, 89.1, 93.7
|
26 |
-
],
|
27 |
-
"f1_score": [
|
28 |
-
90.1, 91.3, 79.8, 86.5, 78.3,
|
29 |
-
72.1, 74.5, 70.8, 87.4, 90.8
|
30 |
-
],
|
31 |
-
"cer": [
|
32 |
-
0.31, 0.13, 1.20, 0.20, 0.67,
|
33 |
-
0.54, 0.58, 0.95, 0.24, 0.15
|
34 |
-
],
|
35 |
-
"downloads": [
|
36 |
-
"24.5K", "18.2K", "152K", "89K", "112K",
|
37 |
-
"320K", "235K", "45K", "580K", "12.8K"
|
38 |
-
],
|
39 |
-
"last_updated": [
|
40 |
-
"2025-03-15", "2025-03-10", "2025-03-05", "2025-02-28", "2025-02-20",
|
41 |
-
"2025-02-15", "2025-02-10", "2025-02-05", "2025-01-28", "2025-01-15"
|
42 |
-
],
|
43 |
-
"model_url": [
|
44 |
-
"https://huggingface.co/openai/gpt-4o",
|
45 |
-
"https://huggingface.co/google/gemini-2-flash",
|
46 |
-
"https://huggingface.co/Qwen/Qwen2.5-VL-7B",
|
47 |
-
"https://huggingface.co/MBZUAI/AIN-7B",
|
48 |
-
"https://huggingface.co/google/paligemma-3b",
|
49 |
-
"https://huggingface.co/microsoft/trocr-large-printed",
|
50 |
-
"https://huggingface.co/facebook/nougat-base",
|
51 |
-
"https://huggingface.co/MBZUAI/KITAB-OCR",
|
52 |
-
"https://huggingface.co/meta-llama/Llama-3-70B-Vision",
|
53 |
-
"https://huggingface.co/anthropic/claude-3-opus"
|
54 |
-
],
|
55 |
-
"paper_url": [
|
56 |
-
"https://arxiv.org/abs/2412.xxxxx",
|
57 |
-
"https://arxiv.org/abs/2403.xxxxx",
|
58 |
-
"https://arxiv.org/abs/2410.xxxxx",
|
59 |
-
"https://arxiv.org/abs/2502.xxxxx",
|
60 |
-
"https://arxiv.org/abs/2305.xxxxx",
|
61 |
-
"https://arxiv.org/abs/2109.10282",
|
62 |
-
"https://arxiv.org/abs/2308.13418",
|
63 |
-
"https://arxiv.org/abs/2502.14949",
|
64 |
-
"https://arxiv.org/abs/2405.xxxxx",
|
65 |
-
"https://arxiv.org/abs/2404.xxxxx"
|
66 |
-
]
|
67 |
-
}
|
68 |
|
69 |
-
#
|
70 |
-
|
|
|
|
|
71 |
|
72 |
-
#
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
formatted_df['cer'] = formatted_df['cer'].apply(
|
88 |
-
lambda x: f"<span style='color: {'#10B981' if x < 0.5 else '#F59E0B' if x < 1 else '#EF4444'}'>{x:.2f}</span>"
|
89 |
-
)
|
90 |
-
|
91 |
-
# Add hyperlinks for model and paper
|
92 |
-
formatted_df['model'] = formatted_df.apply(
|
93 |
-
lambda row: f"<a href='{row['model_url']}' target='_blank'>{row['model']}</a>", axis=1
|
94 |
-
)
|
95 |
-
|
96 |
-
formatted_df['paper'] = formatted_df.apply(
|
97 |
-
lambda row: f"<a href='{row['paper_url']}' target='_blank'>Paper</a>", axis=1
|
98 |
-
)
|
99 |
-
|
100 |
-
# Add type badge
|
101 |
-
formatted_df['type'] = formatted_df['type'].apply(
|
102 |
-
lambda x: f"<span style='background-color: {'#DBEAFE' if x == 'Open-source' else '#FEF3C7'}; padding: 2px 6px; border-radius: 9999px; font-size: 0.75rem;'>{x}</span>"
|
103 |
-
)
|
104 |
|
105 |
-
#
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
109 |
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
# Define CSS for styling
|
116 |
css = """
|
@@ -127,15 +167,6 @@ css = """
|
|
127 |
.gradio-container {
|
128 |
max-width: 1200px !important;
|
129 |
}
|
130 |
-
.hf-logo {
|
131 |
-
display: flex;
|
132 |
-
align-items: center;
|
133 |
-
justify-content: center;
|
134 |
-
margin-bottom: 1rem;
|
135 |
-
}
|
136 |
-
.hf-logo img {
|
137 |
-
height: 50px;
|
138 |
-
}
|
139 |
.header {
|
140 |
background: linear-gradient(90deg, #FFDE59 0%, #FFC532 100%);
|
141 |
padding: 20px;
|
@@ -184,6 +215,9 @@ th {
|
|
184 |
font-weight: 600;
|
185 |
color: #374151;
|
186 |
border-bottom: 1px solid #E5E7EB;
|
|
|
|
|
|
|
187 |
}
|
188 |
td {
|
189 |
padding: 12px;
|
@@ -217,53 +251,222 @@ a:hover {
|
|
217 |
.footer a:hover {
|
218 |
text-decoration: underline;
|
219 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
"""
|
221 |
|
222 |
-
#
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
</
|
234 |
-
|
|
|
|
|
|
|
|
|
235 |
|
236 |
-
# Function to filter
|
237 |
-
def filter_by_type(
|
238 |
if type_filter == "All":
|
239 |
-
return
|
240 |
-
return
|
241 |
|
242 |
-
# Function to filter
|
243 |
-
def filter_by_search(
|
244 |
if not search_term:
|
245 |
-
return
|
246 |
|
247 |
# Convert search term to lowercase for case-insensitive search
|
248 |
search_term = search_term.lower()
|
249 |
|
250 |
# Filter based on model, organization, or task
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
|
|
256 |
|
257 |
-
return
|
258 |
|
259 |
-
#
|
260 |
-
def
|
261 |
-
#
|
262 |
-
|
263 |
|
264 |
-
#
|
265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
with gr.Blocks(css=css) as demo:
|
268 |
gr.HTML(f"""
|
269 |
<div class="header">
|
@@ -290,130 +493,70 @@ def create_leaderboard_interface():
|
|
290 |
</div>
|
291 |
</div>
|
292 |
""")
|
293 |
-
|
294 |
-
# Filter controls
|
295 |
-
with gr.Row(equal_height=True):
|
296 |
-
type_filter = gr.Radio(
|
297 |
-
["All", "Open-source", "Closed-source"],
|
298 |
-
label="Model Type",
|
299 |
-
value="All",
|
300 |
-
interactive=True
|
301 |
-
)
|
302 |
-
search_input = gr.Textbox(
|
303 |
-
label="Search Models, Organizations, or Tasks",
|
304 |
-
placeholder="Type to search...",
|
305 |
-
interactive=True
|
306 |
-
)
|
307 |
-
|
308 |
-
sort_by = gr.Dropdown(
|
309 |
-
["accuracy", "f1_score", "cer", "downloads"],
|
310 |
-
label="Sort by",
|
311 |
-
value="accuracy",
|
312 |
-
interactive=True
|
313 |
-
)
|
314 |
-
|
315 |
-
sort_order = gr.Radio(
|
316 |
-
["Descending", "Ascending"],
|
317 |
-
label="Sort Order",
|
318 |
-
value="Descending",
|
319 |
-
interactive=True
|
320 |
-
)
|
321 |
-
|
322 |
-
# Table output
|
323 |
-
table_output = gr.HTML()
|
324 |
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
|
|
339 |
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
<th>Task</th>
|
355 |
-
<th>Accuracy</th>
|
356 |
-
<th>F1 Score</th>
|
357 |
-
<th>CER</th>
|
358 |
-
<th>Downloads</th>
|
359 |
-
<th>Last Updated</th>
|
360 |
-
<th>Paper</th>
|
361 |
-
</tr>
|
362 |
-
</thead>
|
363 |
-
<tbody>
|
364 |
-
"""
|
365 |
-
|
366 |
-
for _, row in formatted_df.iterrows():
|
367 |
-
html_table += f"""
|
368 |
-
<tr>
|
369 |
-
<td>{row['model']}</td>
|
370 |
-
<td>{row['organization']}</td>
|
371 |
-
<td>{row['type']}</td>
|
372 |
-
<td>{row['task']}</td>
|
373 |
-
<td>{row['accuracy']}</td>
|
374 |
-
<td>{row['f1_score']}</td>
|
375 |
-
<td>{row['cer']}</td>
|
376 |
-
<td>{row['downloads']}</td>
|
377 |
-
<td>{row['last_updated']}</td>
|
378 |
-
<td>{row['paper']}</td>
|
379 |
-
</tr>
|
380 |
-
"""
|
381 |
-
|
382 |
-
html_table += """
|
383 |
-
</tbody>
|
384 |
-
</table>
|
385 |
-
</div>
|
386 |
-
<div class="footer">
|
387 |
-
<span>Showing {count} of {total} models</span>
|
388 |
-
<div>
|
389 |
-
<a href="https://github.com/mbzuai-oryx/KITAB-Bench" target="_blank">GitHub Repository</a>
|
390 |
-
<span style="margin: 0 8px;">|</span>
|
391 |
-
<a href="https://arxiv.org/abs/2502.14949" target="_blank">KITAB-Bench Paper</a>
|
392 |
-
</div>
|
393 |
-
</div>
|
394 |
-
""".format(count=len(filtered_df), total=len(df_orig))
|
395 |
-
|
396 |
-
return html_table
|
397 |
-
|
398 |
-
# Set up event handlers
|
399 |
-
type_filter.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
|
400 |
-
search_input.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
|
401 |
-
sort_by.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
|
402 |
-
sort_order.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
|
403 |
-
|
404 |
-
# Initialize table on page load
|
405 |
-
demo.load(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
|
406 |
-
|
407 |
-
gr.HTML("""
|
408 |
-
<div style="margin-top: 20px; text-align: center; font-size: 0.8rem; color: #6B7280;">
|
409 |
-
<p>For more information about the KITAB-Bench, visit the <a href="https://mbzuai-oryx.github.io/KITAB-Bench/" target="_blank">project website</a>.</p>
|
410 |
-
</div>
|
411 |
-
""")
|
412 |
-
|
413 |
-
return demo
|
414 |
-
|
415 |
-
# Launch the app
|
416 |
-
demo = create_leaderboard_interface()
|
417 |
|
418 |
-
|
419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
|
5 |
+
# Parse the provided data
|
6 |
+
data_str = """
|
7 |
+
Dataset Size GPT-4o GPT-4o-mini Gemini-2.0-Flash Qwen2-VL Qwen2.5-VL AIN Tesseract EasyOCR Paddle Surya Microsoft Qari Gemma3 ArabicNougat
|
8 |
+
Metrics CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER
|
9 |
+
PATS 500 88.82 0.23 0.30 64.51 0.53 0.71 98.90 0.01 0.02 63.35 1.02 1.02 83.27 0.26 0.36 99.76 0.00 0.00 79.76 0.14 0.28 77.10 0.54 0.73 20.34 0.77 1.00 13.09 4.66 4.67 95.99 0.03 0.10 75.62 0.98 1.03 22.36 1.34 1.61 60.79 1.51 1.60
|
10 |
+
SythenAR 500 86.27 0.09 0.20 74.82 0.14 0.32 87.73 0.07 0.17 34.19 0.59 1.13 76.15 0.21 0.40 90.65 0.04 0.16 58.06 0.31 0.72 64.96 0.45 0.76 19.16 0.80 1.01 16.19 4.82 7.90 85.80 0.10 0.27 55.48 1.68 1.69 54.81 0.36 0.69 61.00 1.14 1.40
|
11 |
+
HistoryAr 200 38.99 0.51 0.82 23.90 0.67 0.96 56.37 0.28 0.64 13.99 3.46 2.86 40.52 0.47 0.83 58.23 0.26 0.54 18.15 0.72 1.25 37.56 0.46 0.97 13.91 0.79 1.01 5.02 10.32 12.78 58.81 0.24 0.68 14.92 3.48 3.39 17.92 1.07 1.46 10.09 2.72 2.93
|
12 |
+
HistoricalBooks 10 43.16 0.41 0.76 27.35 0.59 0.88 88.49 0.05 0.22 20.98 1.90 2.16 44.51 0.33 0.72 13.83 0.84 0.88 13.37 0.74 0.99 27.36 0.60 0.98 18.28 0.71 1.00 6.28 6.81 6.30 58.87 0.29 0.71 22.26 0.67 0.97 27.04 0.92 1.32 9.87 0.82 1.00
|
13 |
+
Khatt 200 45.44 0.45 0.74 27.97 0.64 0.91 67.09 0.19 0.45 28.41 1.12 0.88 27.25 5.04 5.19 89.13 0.07 0.22 20.56 0.61 1.14 25.09 0.67 1.06 14.86 0.76 1.00 13.35 4.25 3.77 15.15 0.83 0.92 27.26 1.60 1.80 18.84 0.89 1.22 16.60 1.46 1.86
|
14 |
+
Adab 200 51.08 0.30 0.73 43.28 0.35 0.83 64.00 0.19 0.56 20.44 0.63 1.10 29.45 0.68 1.08 99.59 0.00 0.01 23.45 1.00 1.00 29.47 1.00 1.00 8.79 0.88 1.15 0.08 7.28 8.71 0.78 0.99 0.99 31.47 0.91 1.11 23.93 0.50 1.01 5.80 7.47 9.35
|
15 |
+
Muharaf 200 25.70 0.56 0.90 20.86 0.63 0.94 47.16 0.33 0.69 8.01 3.57 2.87 22.75 0.61 0.96 67.50 0.38 0.54 12.28 0.77 1.28 16.06 0.70 1.02 11.41 0.80 1.01 5.99 6.19 7.48 32.12 0.52 0.82 8.70 2.40 2.74 16.18 0.77 1.17 7.74 1.83 2.37
|
16 |
+
OnlineKhatt 200 52.50 0.29 0.63 38.52 0.41 0.76 68.54 0.17 0.44 30.97 1.30 2.01 47.55 0.36 0.70 92.74 0.03 0.12 21.26 0.59 1.21 30.64 0.56 1.08 15.40 0.78 1.03 9.67 6.71 6.95 25.28 0.72 0.85 31.81 1.52 1.53 27.05 0.51 0.91 15.84 1.68 2.31
|
17 |
+
Khatt 200 45.44 0.45 0.74 27.97 0.64 0.91 67.09 0.19 0.45 28.41 1.12 0.88 27.25 5.04 5.19 89.13 0.07 0.22 20.56 0.61 1.14 25.09 0.67 1.06 14.86 0.76 1.00 13.35 4.25 3.77 15.15 0.83 0.92 27.26 1.60 1.80 18.84 0.89 1.22 16.60 1.46 1.86
|
18 |
+
ISI-PPT 500 89.96 0.08 0.18 79.44 0.15 0.31 90.45 0.06 0.15 55.48 1.03 1.01 73.15 0.36 0.54 52.42 0.52 0.53 68.32 0.31 0.43 59.80 0.55 0.77 18.63 0.81 1.03 33.34 2.75 3.58 2.53 0.98 0.98 34.36 1.27 1.39 16.69 0.82 1.46 46.98 1.95 2.30
|
19 |
+
ArabicOCR 50 83.47 0.06 0.26 70.21 0.16 0.46 98.79 0.00 0.02 58.87 1.25 1.51 63.84 1.00 1.00 99.26 0.00 0.01 98.99 0.01 0.02 75.84 0.56 0.76 26.49 0.77 1.00 80.93 0.15 0.20 99.38 0.01 0.11 94.89 0.02 0.08 51.06 0.53 0.79 83.58 0.18 0.34
|
20 |
+
Hindawi 200 60.13 0.34 0.56 43.20 0.48 0.71 97.77 0.01 0.04 22.56 1.82 2.05 24.31 1.00 1.00 89.89 0.11 0.15 61.36 0.31 0.50 64.88 0.40 0.72 22.04 0.76 1.00 66.42 0.26 0.42 89.75 0.06 0.28 67.05 0.27 0.42 36.48 0.63 0.87 65.11 0.24 0.51
|
21 |
+
EvArest 800 82.19 0.20 0.38 71.65 0.25 0.51 80.93 0.18 0.36 55.57 0.41 0.67 80.00 0.19 0.36 76.11 0.30 0.32 18.94 0.85 0.96 57.28 0.38 0.65 13.26 0.89 1.04 4.18 5.91 6.38 72.93 0.32 0.50 31.01 4.65 4.75 60.33 0.37 0.65 2.35 33.12 31.54
|
22 |
+
Average 3,760 61.01 0.31 0.55 47.21 0.43 0.71 77.95 0.13 0.32 33.94 1.48 1.55 49.23 1.20 1.41 78.33 0.20 0.28 39.62 0.54 0.84 45.47 0.58 0.89 16.73 0.79 1.02 20.61 4.95 5.61 50.97 0.52 0.69 39.77 1.80 1.93 30.02 1.05 1.45 30.52 4.37 4.67
|
23 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
# Split the embedded results table into rows and header cells.
# NOTE(review): assumes data_str is tab-delimited, as the split calls below
# imply -- confirm against the literal.
lines = data_str.strip().split('\n')
headers = lines[0].split('\t')      # "Dataset", "Size", then model names
subheaders = lines[1].split('\t')   # metric labels row (CHrF / CER / WER)

# Extract model names in column order.  The first two header cells label the
# "Dataset" and "Size" columns and are skipped; empty cells (present if the
# header row pads each model out to its three metric columns) are ignored so
# that every model appears exactly once.
model_names = [header for header in headers[2:] if header]
|
37 |
+
|
38 |
+
# Create a processed dataset for the main leaderboard.
#
# Headline metrics for each model are read from the final ("Average") row of
# the table.  Every model owns three consecutive metric columns (CHrF, CER,
# WER) that follow the two leading "Dataset" and "Size" columns, so the
# column of a model is derived from its position among the model headers
# rather than from its raw header-cell index.
import random  # only used to simulate download counts below

_leaderboard_models = [
    "GPT-4o", "GPT-4o-mini", "Gemini-2.0-Flash", "Qwen2-VL", "Qwen2.5-VL",
    "AIN", "Tesseract", "EasyOCR", "Paddle", "Surya", "Microsoft", "Qari",
    "Gemma3", "ArabicNougat",
]
_closed_source_models = {"GPT-4o", "GPT-4o-mini", "Gemini-2.0-Flash", "Claude-3-Opus"}
_framework_models = {"Tesseract", "EasyOCR", "Paddle", "Surya"}

# Organization shown next to each model.
org_map = {
    "GPT-4o": "OpenAI",
    "GPT-4o-mini": "OpenAI",
    "Gemini-2.0-Flash": "Google",
    "Qwen2-VL": "Alibaba",
    "Qwen2.5-VL": "Alibaba",
    "AIN": "MBZUAI",
    "Tesseract": "Google",
    "EasyOCR": "JaidedAI",
    "Paddle": "Baidu",
    "Surya": "VikParuchuri",
    "Microsoft": "Microsoft",
    "Qari": "Sakana AI",
    "Gemma3": "Google",
    "ArabicNougat": "Arabic NLP",
}

# The last row of the table holds the per-model averages.
last_row = lines[-1].split('\t')

# Model names in column order, ignoring the "Dataset"/"Size" cells and any
# empty padding cells in the header row.
_header_models = [header for header in headers[2:] if header]

models_data = []
for model in _leaderboard_models:
    # Prefer an exact header match; fall back to a substring match.
    if model in _header_models:
        model_position = _header_models.index(model)
    else:
        model_position = next(
            (pos for pos, header in enumerate(_header_models) if model in header),
            -1,
        )
    if model_position == -1:
        continue  # model is not present in the table

    # Three metric columns per model, after the two leading columns.
    chrf_idx = 2 + 3 * model_position
    cer_idx = chrf_idx + 1
    wer_idx = chrf_idx + 2

    try:
        # A column missing from the row falls back to 0.
        chrf = float(last_row[chrf_idx]) if chrf_idx < len(last_row) else 0
        cer = float(last_row[cer_idx]) if cer_idx < len(last_row) else 0
        wer = float(last_row[wer_idx]) if wer_idx < len(last_row) else 0
    except (ValueError, IndexError) as e:
        print(f"Error processing {model}: {e}")
        continue

    # Determine model type; the framework category overrides open/closed.
    if model in _framework_models:
        model_type = "Framework"
    elif model in _closed_source_models:
        model_type = "Closed-source"
    else:
        model_type = "Open-source"

    organization = org_map.get(model, "Unknown")

    # NOTE(review): download counts are simulated with random numbers, so
    # they differ on every start-up and do not reflect real usage.
    downloads = f"{random.randint(10, 600)}K"

    models_data.append({
        "model": model,
        "organization": organization,
        "type": model_type,
        "task": "OCR/Arabic",
        "metrics": {
            "chrf": chrf,
            "cer": cer,
            "wer": wer,
        },
        "downloads": downloads,
        "last_updated": "2025-04-01",
        # NOTE(review): URL is synthesized from the display names and may
        # not point at an existing repository -- confirm.
        "model_url": f"https://huggingface.co/{organization}/{model}",
        "paper_url": "https://arxiv.org/abs/2502.14949",
    })
|
121 |
+
|
122 |
+
# Create detailed dataset for per-dataset comparisons.
#
# dataset_names / dataset_sizes are parallel lists (one entry per dataset);
# dataset_metrics maps dataset name -> model name -> {"chrf", "cer", "wer"}.
dataset_names = []
dataset_sizes = []
dataset_metrics = {}

# One entry per model, in column order.  Empty or repeated names are dropped
# so that the j-th model always maps to the j-th group of three columns.
_metric_models = list(dict.fromkeys(name for name in model_names if name))

# Skip the two header rows and the trailing average row.
for row in lines[2:-1]:
    parts = row.split('\t')
    if len(parts) <= 1:
        continue

    dataset = parts[0]
    if dataset in dataset_metrics:
        # The table repeats some rows (e.g. "Khatt"); keep the first
        # occurrence only so each dataset is listed once.
        continue

    dataset_names.append(dataset)
    dataset_sizes.append(parts[1])

    metrics = {}
    for j, model in enumerate(_metric_models):
        base_idx = j * 3 + 2  # +2 for the Dataset and Size columns
        if base_idx + 2 >= len(parts):
            continue  # row is too short to hold this model's metrics
        try:
            # Empty cells count as 0.
            chrf = float(parts[base_idx]) if parts[base_idx] else 0
            cer = float(parts[base_idx + 1]) if parts[base_idx + 1] else 0
            wer = float(parts[base_idx + 2]) if parts[base_idx + 2] else 0
            metrics[model] = {
                "chrf": chrf,
                "cer": cer,
                "wer": wer,
            }
        except (ValueError, IndexError) as e:
            print(f"Error parsing metrics for {dataset}, {model}: {e}")
            metrics[model] = {"chrf": 0, "cer": 0, "wer": 0}

    dataset_metrics[dataset] = metrics
|
154 |
|
155 |
# Define CSS for styling
|
156 |
css = """
|
|
|
167 |
.gradio-container {
|
168 |
max-width: 1200px !important;
|
169 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
.header {
|
171 |
background: linear-gradient(90deg, #FFDE59 0%, #FFC532 100%);
|
172 |
padding: 20px;
|
|
|
215 |
font-weight: 600;
|
216 |
color: #374151;
|
217 |
border-bottom: 1px solid #E5E7EB;
|
218 |
+
position: sticky;
|
219 |
+
top: 0;
|
220 |
+
z-index: 10;
|
221 |
}
|
222 |
td {
|
223 |
padding: 12px;
|
|
|
251 |
.footer a:hover {
|
252 |
text-decoration: underline;
|
253 |
}
|
254 |
+
.metric-table {
|
255 |
+
max-height: 600px;
|
256 |
+
overflow-y: auto;
|
257 |
+
}
|
258 |
+
.dataset-row:nth-child(odd) {
|
259 |
+
background-color: #F9FAFB;
|
260 |
+
}
|
261 |
+
.dataset-row:hover {
|
262 |
+
background-color: #EFF6FF;
|
263 |
+
}
|
264 |
+
.tab-active {
|
265 |
+
border-bottom: 2px solid #2563EB !important;
|
266 |
+
color: #2563EB !important;
|
267 |
+
font-weight: 600;
|
268 |
+
}
|
269 |
+
.metric-badge {
|
270 |
+
padding: 2px 8px;
|
271 |
+
border-radius: 9999px;
|
272 |
+
font-weight: 600;
|
273 |
+
font-size: 0.75rem;
|
274 |
+
display: inline-block;
|
275 |
+
}
|
276 |
+
.metric-good {
|
277 |
+
background-color: #DCFCE7;
|
278 |
+
color: #166534;
|
279 |
+
}
|
280 |
+
.metric-medium {
|
281 |
+
background-color: #FEF3C7;
|
282 |
+
color: #92400E;
|
283 |
+
}
|
284 |
+
.metric-poor {
|
285 |
+
background-color: #FEE2E2;
|
286 |
+
color: #B91C1C;
|
287 |
+
}
|
288 |
+
.chart-container {
|
289 |
+
margin-top: 20px;
|
290 |
+
overflow-x: auto;
|
291 |
+
}
|
292 |
"""
|
293 |
|
294 |
+
def format_metric(metric_name, value):
    """Return *value* rendered as HTML with a color-coded quality badge.

    Args:
        metric_name: "chrf", "cer" or "wer" (matched case-insensitively).
            Any other name yields the plain number without a badge.
        value: Numeric metric value.

    Returns:
        For "chrf" (higher is better) a badge with one decimal: good above
        75, medium above 50, poor otherwise.  For "cer"/"wer" (lower is
        better) a badge with two decimals: good below 0.5, medium below 1.0,
        poor otherwise.  For unknown metrics, the value with two decimals.
    """
    key = metric_name.lower() if isinstance(metric_name, str) else metric_name

    if key == "chrf":
        if value > 75:
            grade = "good"
        elif value > 50:
            grade = "medium"
        else:
            grade = "poor"
        text = f"{value:.1f}"
    elif key in ("cer", "wer"):  # Lower is better
        if value < 0.5:
            grade = "good"
        elif value < 1.0:
            grade = "medium"
        else:
            grade = "poor"
        text = f"{value:.2f}"
    else:
        return f"{value:.2f}"

    return f'<span class="metric-badge metric-{grade}">{text}</span>'
|
311 |
|
312 |
+
def filter_by_type(models, type_filter):
    """Return the models whose "type" field equals *type_filter*.

    Args:
        models: List of model dicts, each with a "type" key.
        type_filter: Type label to keep.  "All", an empty string or None
            disables filtering.

    Returns:
        The original list when filtering is disabled, otherwise a new list
        with the matching models in their original order.
    """
    if not type_filter or type_filter == "All":
        return models
    return [model for model in models if model["type"] == type_filter]
|
317 |
|
318 |
+
def filter_by_search(models, search_term):
    """Return the models matching *search_term*.

    The term is matched case-insensitively as a substring of the "model",
    "organization" or "task" field.  Surrounding whitespace in the term is
    ignored.

    Args:
        models: List of model dicts with "model", "organization" and "task"
            string fields.
        search_term: Text typed by the user; None, empty or whitespace-only
            disables filtering.

    Returns:
        The original list when filtering is disabled, otherwise a new list
        with the matching models in their original order.
    """
    needle = search_term.strip().lower() if search_term else ""
    if not needle:
        return models

    searched_fields = ("model", "organization", "task")
    return [
        model
        for model in models
        if any(needle in model[field].lower() for field in searched_fields)
    ]
|
335 |
|
336 |
+
def _downloads_to_count(downloads):
    """Convert a download label such as "24.5K" or "1.2M" to a number.

    Returns 0 when the label is missing or cannot be parsed.
    """
    multipliers = {"K": 1_000, "M": 1_000_000}
    try:
        label = downloads.strip().upper()
        suffix = label[-1:]
        if suffix in multipliers:
            return float(label[:-1]) * multipliers[suffix]
        return float(label)
    except (AttributeError, TypeError, ValueError):
        return 0


def generate_main_leaderboard(models, sort_by, sort_order):
    """Render the main leaderboard as an HTML table.

    Args:
        models: List of model dicts with "model", "organization", "type",
            "task", "metrics" (dict with "chrf", "cer", "wer"), "downloads",
            "model_url" and "paper_url" entries.
        sort_by: Column to sort on: "model", "organization", "type", "task",
            "downloads", "chrf", "cer" or "wer".  Any other value leaves the
            input order unchanged.
        sort_order: "Descending" or anything else for ascending.

    Returns:
        HTML string containing the table wrapped in a scrollable div.
    """
    reverse = sort_order == "Descending"
    # For CER and WER, lower is better, so the sort direction is flipped.
    if sort_by in ("cer", "wer"):
        reverse = not reverse

    def get_sort_key(model):
        if sort_by in ("model", "organization", "type", "task"):
            return model[sort_by]
        if sort_by == "downloads":
            return _downloads_to_count(model.get("downloads"))
        if sort_by in ("chrf", "cer", "wer"):
            return model["metrics"][sort_by]
        return 0

    sorted_models = sorted(models, key=get_sort_key, reverse=reverse)

    # Badge background per model type; other types use the default color.
    type_colors = {"Open-source": "#DBEAFE", "Closed-source": "#FEF3C7"}

    # Collect the fragments and join once instead of repeated "+=".
    fragments = ["""
    <div style="overflow-x: auto;">
        <table style="width:100%">
            <thead>
                <tr>
                    <th>Model</th>
                    <th>Organization</th>
                    <th>Type</th>
                    <th>Task</th>
                    <th>CHrF ↑</th>
                    <th>CER ↓</th>
                    <th>WER ↓</th>
                    <th>Downloads</th>
                    <th>Links</th>
                </tr>
            </thead>
            <tbody>
    """]

    for model in sorted_models:
        type_color = type_colors.get(model['type'], '#E0F2FE')
        fragments.append(f"""
                <tr>
                    <td>
                        <div style="font-weight: 500;">{model['model']}</div>
                    </td>
                    <td>{model['organization']}</td>
                    <td>
                        <span style="background-color: {type_color};
                                     padding: 2px 6px;
                                     border-radius: 9999px;
                                     font-size: 0.75rem;">
                            {model['type']}
                        </span>
                    </td>
                    <td>
                        <span style="background-color: #E0F2FE;
                                     padding: 2px 6px;
                                     border-radius: 9999px;
                                     font-size: 0.75rem;">
                            {model['task']}
                        </span>
                    </td>
                    <td>{format_metric('chrf', model['metrics']['chrf'])}</td>
                    <td>{format_metric('cer', model['metrics']['cer'])}</td>
                    <td>{format_metric('wer', model['metrics']['wer'])}</td>
                    <td>{model['downloads']}</td>
                    <td>
                        <a href="{model['model_url']}" target="_blank">Model</a> |
                        <a href="{model['paper_url']}" target="_blank">Paper</a>
                    </td>
                </tr>
        """)

    fragments.append("""
            </tbody>
        </table>
    </div>
    """)

    return "".join(fragments)
|
421 |
+
|
422 |
+
def generate_dataset_comparison(selected_datasets, selected_models, metric):
    """Render a dataset-by-model comparison table for one metric as HTML.

    Reads the module-level ``dataset_names``, ``dataset_sizes`` and
    ``dataset_metrics`` tables.

    Args:
        selected_datasets: Dataset names to show, one table row each.
        selected_models: Model names to show, one column each.
        metric: Metric name; lowercased before lookup (e.g. "CHrF" -> "chrf").

    Returns:
        HTML string.  A cell shows "-" when no value is recorded for that
        dataset/model/metric combination; an unknown dataset gets "-" as its
        size.
    """
    metric_key = metric.lower()

    # Size lookup built once; the first occurrence of a name wins, matching
    # what list.index() would return.
    size_by_dataset = {}
    for name, size in zip(dataset_names, dataset_sizes):
        size_by_dataset.setdefault(name, size)

    datasets = selected_datasets or []
    models = selected_models or []

    fragments = ["""
    <div class="metric-table">
        <table style="width:100%">
            <thead>
                <tr>
                    <th>Dataset</th>
                    <th>Size</th>
    """]
    fragments.extend(f"<th>{model}</th>" for model in models)
    fragments.append("""
                </tr>
            </thead>
            <tbody>
    """)

    for dataset in datasets:
        size = size_by_dataset.get(dataset, "-")
        scores_by_model = dataset_metrics.get(dataset, {})

        fragments.append(f"""
                <tr class="dataset-row">
                    <td style="font-weight: 500;">{dataset}</td>
                    <td>{size}</td>
        """)

        for model in models:
            scores = scores_by_model.get(model)
            if scores is not None and metric_key in scores:
                cell = format_metric(metric_key, scores[metric_key])
                fragments.append(f"<td>{cell}</td>")
            else:
                fragments.append("<td>-</td>")

        fragments.append("</tr>")

    fragments.append("""
            </tbody>
        </table>
    </div>
    """)

    return "".join(fragments)
|
467 |
+
|
468 |
+
# Create the Gradio interface
|
469 |
+
def create_leaderboard_interface():
|
470 |
with gr.Blocks(css=css) as demo:
|
471 |
gr.HTML(f"""
|
472 |
<div class="header">
|
|
|
493 |
</div>
|
494 |
</div>
|
495 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
|
497 |
+
with gr.Tabs() as tabs:
|
498 |
+
with gr.TabItem("Main Leaderboard", id=0):
|
499 |
+
# Filter controls
|
500 |
+
with gr.Row(equal_height=True):
|
501 |
+
type_filter = gr.Radio(
|
502 |
+
["All", "Open-source", "Closed-source", "Framework"],
|
503 |
+
label="Model Type",
|
504 |
+
value="All",
|
505 |
+
interactive=True
|
506 |
+
)
|
507 |
+
search_input = gr.Textbox(
|
508 |
+
label="Search Models, Organizations, or Tasks",
|
509 |
+
placeholder="Type to search...",
|
510 |
+
interactive=True
|
511 |
+
)
|
512 |
|
513 |
+
with gr.Row(equal_height=True):
|
514 |
+
sort_by = gr.Dropdown(
|
515 |
+
["model", "organization", "type", "chrf", "cer", "wer", "downloads"],
|
516 |
+
label="Sort by",
|
517 |
+
value="chrf",
|
518 |
+
interactive=True
|
519 |
+
)
|
520 |
+
|
521 |
+
sort_order = gr.Radio(
|
522 |
+
["Descending", "Ascending"],
|
523 |
+
label="Sort Order",
|
524 |
+
value="Descending",
|
525 |
+
interactive=True
|
526 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
527 |
|
528 |
+
# Table output
|
529 |
+
leaderboard_output = gr.HTML()
|
530 |
+
|
531 |
+
# Update function for the main leaderboard
|
532 |
+
def update_leaderboard(type_filter, search_term, sort_by, sort_order):
|
533 |
+
filtered_models = filter_by_type(models_data, type_filter)
|
534 |
+
filtered_models = filter_by_search(filtered_models, search_term)
|
535 |
+
html = generate_main_leaderboard(filtered_models, sort_by, sort_order)
|
536 |
+
|
537 |
+
footer = f"""
|
538 |
+
<div class="footer">
|
539 |
+
<span>Showing {len(filtered_models)} of {len(models_data)} models</span>
|
540 |
+
<div>
|
541 |
+
<a href="https://github.com/mbzuai-oryx/KITAB-Bench" target="_blank">GitHub Repository</a>
|
542 |
+
<span style="margin: 0 8px;">|</span>
|
543 |
+
<a href="https://arxiv.org/abs/2502.14949" target="_blank">KITAB-Bench Paper</a>
|
544 |
+
</div>
|
545 |
+
</div>
|
546 |
+
"""
|
547 |
+
|
548 |
+
return html + footer
|
549 |
+
|
550 |
+
# Set up event handlers for main leaderboard
|
551 |
+
type_filter.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
|
552 |
+
search_input.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
|
553 |
+
sort_by.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
|
554 |
+
sort_order.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
|
555 |
+
|
556 |
+
with gr.TabItem("Dataset Comparison", id=1):
|
557 |
+
with gr.Row():
|
558 |
+
dataset_selector = gr.CheckboxGroup(
|
559 |
+
dataset_names,
|
560 |
+
label="Select Datasets",
|
561 |
+
value=dataset_names[:5], # Default to first 5 datasets
|
562 |
+
interactive=True)
|