kitab-bench committed on
Commit
e98d277
·
verified ·
1 Parent(s): 5fd0555

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +412 -269
app.py CHANGED
@@ -2,115 +2,155 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
 
5
- # Sample data - in a real application, you would load this from a database or API
6
- data = {
7
- "model": [
8
- "GPT-4o", "Gemini-2.0-Flash", "Qwen2.5-VL-7B", "AIN-7B", "PaliGemma-3B",
9
- "TrOCR-large", "nougat-base", "KITAB-OCR", "Llama-3-70B-Vision", "claude-3-opus"
10
- ],
11
- "organization": [
12
- "OpenAI", "Google", "Alibaba", "MBZUAI", "Google",
13
- "Microsoft", "Meta", "MBZUAI", "Meta", "Anthropic"
14
- ],
15
- "type": [
16
- "Closed-source", "Closed-source", "Open-source", "Open-source", "Open-source",
17
- "Open-source", "Open-source", "Open-source", "Open-source", "Closed-source"
18
- ],
19
- "task": [
20
- "OCR/Vision", "OCR/Vision", "OCR/Vision", "OCR/Vision", "OCR/Vision",
21
- "OCR", "OCR/Document", "OCR/Arabic", "Vision", "Vision"
22
- ],
23
- "accuracy": [
24
- 92.5, 94.2, 83.4, 87.2, 81.5,
25
- 76.8, 79.3, 75.2, 89.1, 93.7
26
- ],
27
- "f1_score": [
28
- 90.1, 91.3, 79.8, 86.5, 78.3,
29
- 72.1, 74.5, 70.8, 87.4, 90.8
30
- ],
31
- "cer": [
32
- 0.31, 0.13, 1.20, 0.20, 0.67,
33
- 0.54, 0.58, 0.95, 0.24, 0.15
34
- ],
35
- "downloads": [
36
- "24.5K", "18.2K", "152K", "89K", "112K",
37
- "320K", "235K", "45K", "580K", "12.8K"
38
- ],
39
- "last_updated": [
40
- "2025-03-15", "2025-03-10", "2025-03-05", "2025-02-28", "2025-02-20",
41
- "2025-02-15", "2025-02-10", "2025-02-05", "2025-01-28", "2025-01-15"
42
- ],
43
- "model_url": [
44
- "https://huggingface.co/openai/gpt-4o",
45
- "https://huggingface.co/google/gemini-2-flash",
46
- "https://huggingface.co/Qwen/Qwen2.5-VL-7B",
47
- "https://huggingface.co/MBZUAI/AIN-7B",
48
- "https://huggingface.co/google/paligemma-3b",
49
- "https://huggingface.co/microsoft/trocr-large-printed",
50
- "https://huggingface.co/facebook/nougat-base",
51
- "https://huggingface.co/MBZUAI/KITAB-OCR",
52
- "https://huggingface.co/meta-llama/Llama-3-70B-Vision",
53
- "https://huggingface.co/anthropic/claude-3-opus"
54
- ],
55
- "paper_url": [
56
- "https://arxiv.org/abs/2412.xxxxx",
57
- "https://arxiv.org/abs/2403.xxxxx",
58
- "https://arxiv.org/abs/2410.xxxxx",
59
- "https://arxiv.org/abs/2502.xxxxx",
60
- "https://arxiv.org/abs/2305.xxxxx",
61
- "https://arxiv.org/abs/2109.10282",
62
- "https://arxiv.org/abs/2308.13418",
63
- "https://arxiv.org/abs/2502.14949",
64
- "https://arxiv.org/abs/2405.xxxxx",
65
- "https://arxiv.org/abs/2404.xxxxx"
66
- ]
67
- }
68
 
69
- # Create DataFrame
70
- df = pd.DataFrame(data)
 
 
71
 
72
- # Function to apply color formatting to the dataframe based on metric values
73
- def format_dataframe(df):
74
- # Create a copy to avoid modifying the original
75
- formatted_df = df.copy()
76
-
77
- # Format accuracy and F1 Score (higher is better)
78
- formatted_df['accuracy'] = formatted_df['accuracy'].apply(
79
- lambda x: f"<span style='color: {'#10B981' if x > 85 else '#F59E0B' if x > 75 else '#EF4444'}'>{x:.1f}</span>"
80
- )
81
-
82
- formatted_df['f1_score'] = formatted_df['f1_score'].apply(
83
- lambda x: f"<span style='color: {'#10B981' if x > 85 else '#F59E0B' if x > 75 else '#EF4444'}'>{x:.1f}</span>"
84
- )
85
-
86
- # Format CER (lower is better)
87
- formatted_df['cer'] = formatted_df['cer'].apply(
88
- lambda x: f"<span style='color: {'#10B981' if x < 0.5 else '#F59E0B' if x < 1 else '#EF4444'}'>{x:.2f}</span>"
89
- )
90
-
91
- # Add hyperlinks for model and paper
92
- formatted_df['model'] = formatted_df.apply(
93
- lambda row: f"<a href='{row['model_url']}' target='_blank'>{row['model']}</a>", axis=1
94
- )
95
-
96
- formatted_df['paper'] = formatted_df.apply(
97
- lambda row: f"<a href='{row['paper_url']}' target='_blank'>Paper</a>", axis=1
98
- )
99
-
100
- # Add type badge
101
- formatted_df['type'] = formatted_df['type'].apply(
102
- lambda x: f"<span style='background-color: {'#DBEAFE' if x == 'Open-source' else '#FEF3C7'}; padding: 2px 6px; border-radius: 9999px; font-size: 0.75rem;'>{x}</span>"
103
- )
104
 
105
- # Add task badge
106
- formatted_df['task'] = formatted_df['task'].apply(
107
- lambda x: f"<span style='background-color: #E0F2FE; padding: 2px 6px; border-radius: 9999px; font-size: 0.75rem;'>{x}</span>"
108
- )
 
 
109
 
110
- # Drop URLs columns as they're now embedded in the model and paper columns
111
- formatted_df = formatted_df.drop(columns=['model_url', 'paper_url'])
 
 
 
 
112
 
113
- return formatted_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  # Define CSS for styling
116
  css = """
@@ -127,15 +167,6 @@ css = """
127
  .gradio-container {
128
  max-width: 1200px !important;
129
  }
130
- .hf-logo {
131
- display: flex;
132
- align-items: center;
133
- justify-content: center;
134
- margin-bottom: 1rem;
135
- }
136
- .hf-logo img {
137
- height: 50px;
138
- }
139
  .header {
140
  background: linear-gradient(90deg, #FFDE59 0%, #FFC532 100%);
141
  padding: 20px;
@@ -184,6 +215,9 @@ th {
184
  font-weight: 600;
185
  color: #374151;
186
  border-bottom: 1px solid #E5E7EB;
 
 
 
187
  }
188
  td {
189
  padding: 12px;
@@ -217,53 +251,222 @@ a:hover {
217
  .footer a:hover {
218
  text-decoration: underline;
219
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  """
221
 
222
- # Hugging Face logo SVG (in-lined for simplicity)
223
- hf_logo = """
224
- <svg xmlns="http://www.w3.org/2000/svg" width="120" height="40" viewBox="0 0 95 25" fill="none">
225
- <path d="M8.51825 0H11.3583V17.7547H8.51825V0Z" fill="black"/>
226
- <path d="M30.1975 5.07422H33.0375V17.7547H30.1975V16.2969C28.9408 17.4158 27.6842 18.0602 25.94 18.0602C22.455 18.0602 19.5825 15.1877 19.5825 11.4358C19.5825 7.6839 22.455 4.8114 25.94 4.8114C27.6842 4.8114 28.9408 5.4558 30.1975 6.5747V5.07422ZM26.2882 15.403C28.7225 15.403 30.1975 13.7014 30.1975 11.4358C30.1975 9.1702 28.7225 7.4686 26.2882 7.4686C23.8539 7.4686 22.3789 9.1702 22.3789 11.4358C22.3789 13.7014 23.8539 15.403 26.2882 15.403Z" fill="black"/>
227
- <path d="M35.1311 11.4358C35.1311 7.6839 38.0036 4.8114 41.7555 4.8114C45.5075 4.8114 48.38 7.6839 48.38 11.4358C48.38 15.1877 45.5075 18.0602 41.7555 18.0602C38.0036 18.0602 35.1311 15.1877 35.1311 11.4358ZM45.5839 11.4358C45.5839 9.1702 44.1089 7.4686 41.7555 7.4686C39.402 7.4686 37.927 9.1702 37.927 11.4358C37.927 13.7014 39.402 15.403 41.7555 15.403C44.1089 15.403 45.5839 13.7014 45.5839 11.4358Z" fill="black"/>
228
- <path d="M50.2717 0H53.1117V17.7547H50.2717V0Z" fill="black"/>
229
- <path d="M55.1956 0H58.0356V17.7547H55.1956V0Z" fill="black"/>
230
- <path d="M68.3864 11.4359C68.3864 9.0824 66.9114 7.4686 64.558 7.4686C62.2046 7.4686 60.6521 9.0824 60.6521 11.4359C60.6521 13.7893 62.2047 15.4031 64.558 15.4031C66.9114 15.4031 68.3864 13.7893 68.3864 11.4359ZM57.8122 11.4359C57.8122 7.6839 60.6847 4.8114 64.4367 4.8114C66.1809 4.8114 67.4374 5.45579 68.6939 6.57469V5.07422H71.5341V18.0602C71.5341 22.1174 68.5725 24.618 64.5575 24.618C61.2553 24.618 58.5041 22.8739 57.7383 20.0013L60.5347 19.1142C61.0577 20.6146 62.5748 21.9605 64.5575 21.9605C66.9111 21.9605 68.6941 20.5285 68.6941 18.0602V16.297C67.4374 17.4159 66.1809 18.0603 64.4367 18.0603C60.6847 18.0603 57.8122 15.1878 57.8122 11.4359Z" fill="black"/>
231
- <path d="M74.0307 11.4358C74.0307 7.6839 76.9032 4.8114 80.6551 4.8114C84.4071 4.8114 87.2796 7.6839 87.2796 11.4358C87.2796 15.1877 84.4071 18.0602 80.6551 18.0602C76.9032 18.0602 74.0307 15.1877 74.0307 11.4358ZM84.4835 11.4358C84.4835 9.1702 83.0085 7.4686 80.6551 7.4686C78.3016 7.4686 76.8266 9.1702 76.8266 11.4358C76.8266 13.7014 78.3016 15.403 80.6551 15.403C83.0085 15.403 84.4835 13.7014 84.4835 11.4358Z" fill="black"/>
232
- <path d="M89.9903 2.69156C89.9903 1.63531 90.7989 0.82666 91.8551 0.82666C92.9114 0.82666 93.72 1.63531 93.72 2.69156C93.72 3.74781 92.9114 4.55645 91.8551 4.55645C90.7989 4.55645 89.9903 3.74781 89.9903 2.69156ZM90.1952 5.07422H93.5149V17.7547H90.1952V5.07422Z" fill="black"/>
233
- </svg>
234
- """
 
 
 
 
235
 
236
- # Function to filter dataframe based on type
237
- def filter_by_type(df, type_filter):
238
  if type_filter == "All":
239
- return df
240
- return df[df["type"].str.contains(type_filter)]
241
 
242
- # Function to filter dataframe based on search term
243
- def filter_by_search(df, search_term):
244
  if not search_term:
245
- return df
246
 
247
  # Convert search term to lowercase for case-insensitive search
248
  search_term = search_term.lower()
249
 
250
  # Filter based on model, organization, or task
251
- mask = (
252
- df["model"].str.lower().str.contains(search_term) |
253
- df["organization"].str.lower().str.contains(search_term) |
254
- df["task"].str.lower().str.contains(search_term)
255
- )
 
256
 
257
- return df[mask]
258
 
259
- # Create the Gradio interface
260
- def create_leaderboard_interface():
261
- # Create DataFrame
262
- df_orig = pd.DataFrame(data)
263
 
264
- # Sort by accuracy descending by default
265
- df_orig = df_orig.sort_values(by="accuracy", ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  with gr.Blocks(css=css) as demo:
268
  gr.HTML(f"""
269
  <div class="header">
@@ -290,130 +493,70 @@ def create_leaderboard_interface():
290
  </div>
291
  </div>
292
  """)
293
-
294
- # Filter controls
295
- with gr.Row(equal_height=True):
296
- type_filter = gr.Radio(
297
- ["All", "Open-source", "Closed-source"],
298
- label="Model Type",
299
- value="All",
300
- interactive=True
301
- )
302
- search_input = gr.Textbox(
303
- label="Search Models, Organizations, or Tasks",
304
- placeholder="Type to search...",
305
- interactive=True
306
- )
307
-
308
- sort_by = gr.Dropdown(
309
- ["accuracy", "f1_score", "cer", "downloads"],
310
- label="Sort by",
311
- value="accuracy",
312
- interactive=True
313
- )
314
-
315
- sort_order = gr.Radio(
316
- ["Descending", "Ascending"],
317
- label="Sort Order",
318
- value="Descending",
319
- interactive=True
320
- )
321
-
322
- # Table output
323
- table_output = gr.HTML()
324
 
325
- # Define update function
326
- def update_table(type_filter, search_term, sort_by, sort_order):
327
- # Filter by type
328
- filtered_df = filter_by_type(df_orig, type_filter)
329
-
330
- # Filter by search term
331
- filtered_df = filter_by_search(filtered_df, search_term)
332
-
333
- # Sort the dataframe
334
- is_ascending = sort_order == "Ascending"
335
-
336
- # For CER, we might want to reverse the default sorting (since lower is better)
337
- if sort_by == "cer":
338
- is_ascending = not is_ascending
 
339
 
340
- filtered_df = filtered_df.sort_values(by=sort_by, ascending=is_ascending)
341
-
342
- # Format the dataframe
343
- formatted_df = format_dataframe(filtered_df)
344
-
345
- # Generate HTML table
346
- html_table = f"""
347
- <div style="overflow-x: auto;">
348
- <table style="width:100%">
349
- <thead>
350
- <tr>
351
- <th>Model</th>
352
- <th>Organization</th>
353
- <th>Type</th>
354
- <th>Task</th>
355
- <th>Accuracy</th>
356
- <th>F1 Score</th>
357
- <th>CER</th>
358
- <th>Downloads</th>
359
- <th>Last Updated</th>
360
- <th>Paper</th>
361
- </tr>
362
- </thead>
363
- <tbody>
364
- """
365
-
366
- for _, row in formatted_df.iterrows():
367
- html_table += f"""
368
- <tr>
369
- <td>{row['model']}</td>
370
- <td>{row['organization']}</td>
371
- <td>{row['type']}</td>
372
- <td>{row['task']}</td>
373
- <td>{row['accuracy']}</td>
374
- <td>{row['f1_score']}</td>
375
- <td>{row['cer']}</td>
376
- <td>{row['downloads']}</td>
377
- <td>{row['last_updated']}</td>
378
- <td>{row['paper']}</td>
379
- </tr>
380
- """
381
-
382
- html_table += """
383
- </tbody>
384
- </table>
385
- </div>
386
- <div class="footer">
387
- <span>Showing {count} of {total} models</span>
388
- <div>
389
- <a href="https://github.com/mbzuai-oryx/KITAB-Bench" target="_blank">GitHub Repository</a>
390
- <span style="margin: 0 8px;">|</span>
391
- <a href="https://arxiv.org/abs/2502.14949" target="_blank">KITAB-Bench Paper</a>
392
- </div>
393
- </div>
394
- """.format(count=len(filtered_df), total=len(df_orig))
395
-
396
- return html_table
397
-
398
- # Set up event handlers
399
- type_filter.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
400
- search_input.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
401
- sort_by.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
402
- sort_order.change(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
403
-
404
- # Initialize table on page load
405
- demo.load(update_table, [type_filter, search_input, sort_by, sort_order], table_output)
406
-
407
- gr.HTML("""
408
- <div style="margin-top: 20px; text-align: center; font-size: 0.8rem; color: #6B7280;">
409
- <p>For more information about the KITAB-Bench, visit the <a href="https://mbzuai-oryx.github.io/KITAB-Bench/" target="_blank">project website</a>.</p>
410
- </div>
411
- """)
412
-
413
- return demo
414
-
415
- # Launch the app
416
- demo = create_leaderboard_interface()
417
 
418
- if __name__ == "__main__":
419
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
 
5
+ # Parse the provided data
6
+ data_str = """
7
+ Dataset Size GPT-4o GPT-4o-mini Gemini-2.0-Flash Qwen2-VL Qwen2.5-VL AIN Tesseract EasyOCR Paddle Surya Microsoft Qari Gemma3 ArabicNougat
8
+ Metrics CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER CHrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER ChrF CER WER
9
+ PATS 500 88.82 0.23 0.30 64.51 0.53 0.71 98.90 0.01 0.02 63.35 1.02 1.02 83.27 0.26 0.36 99.76 0.00 0.00 79.76 0.14 0.28 77.10 0.54 0.73 20.34 0.77 1.00 13.09 4.66 4.67 95.99 0.03 0.10 75.62 0.98 1.03 22.36 1.34 1.61 60.79 1.51 1.60
10
+ SythenAR 500 86.27 0.09 0.20 74.82 0.14 0.32 87.73 0.07 0.17 34.19 0.59 1.13 76.15 0.21 0.40 90.65 0.04 0.16 58.06 0.31 0.72 64.96 0.45 0.76 19.16 0.80 1.01 16.19 4.82 7.90 85.80 0.10 0.27 55.48 1.68 1.69 54.81 0.36 0.69 61.00 1.14 1.40
11
+ HistoryAr 200 38.99 0.51 0.82 23.90 0.67 0.96 56.37 0.28 0.64 13.99 3.46 2.86 40.52 0.47 0.83 58.23 0.26 0.54 18.15 0.72 1.25 37.56 0.46 0.97 13.91 0.79 1.01 5.02 10.32 12.78 58.81 0.24 0.68 14.92 3.48 3.39 17.92 1.07 1.46 10.09 2.72 2.93
12
+ HistoricalBooks 10 43.16 0.41 0.76 27.35 0.59 0.88 88.49 0.05 0.22 20.98 1.90 2.16 44.51 0.33 0.72 13.83 0.84 0.88 13.37 0.74 0.99 27.36 0.60 0.98 18.28 0.71 1.00 6.28 6.81 6.30 58.87 0.29 0.71 22.26 0.67 0.97 27.04 0.92 1.32 9.87 0.82 1.00
13
+ Khatt 200 45.44 0.45 0.74 27.97 0.64 0.91 67.09 0.19 0.45 28.41 1.12 0.88 27.25 5.04 5.19 89.13 0.07 0.22 20.56 0.61 1.14 25.09 0.67 1.06 14.86 0.76 1.00 13.35 4.25 3.77 15.15 0.83 0.92 27.26 1.60 1.80 18.84 0.89 1.22 16.60 1.46 1.86
14
+ Adab 200 51.08 0.30 0.73 43.28 0.35 0.83 64.00 0.19 0.56 20.44 0.63 1.10 29.45 0.68 1.08 99.59 0.00 0.01 23.45 1.00 1.00 29.47 1.00 1.00 8.79 0.88 1.15 0.08 7.28 8.71 0.78 0.99 0.99 31.47 0.91 1.11 23.93 0.50 1.01 5.80 7.47 9.35
15
+ Muharaf 200 25.70 0.56 0.90 20.86 0.63 0.94 47.16 0.33 0.69 8.01 3.57 2.87 22.75 0.61 0.96 67.50 0.38 0.54 12.28 0.77 1.28 16.06 0.70 1.02 11.41 0.80 1.01 5.99 6.19 7.48 32.12 0.52 0.82 8.70 2.40 2.74 16.18 0.77 1.17 7.74 1.83 2.37
16
+ OnlineKhatt 200 52.50 0.29 0.63 38.52 0.41 0.76 68.54 0.17 0.44 30.97 1.30 2.01 47.55 0.36 0.70 92.74 0.03 0.12 21.26 0.59 1.21 30.64 0.56 1.08 15.40 0.78 1.03 9.67 6.71 6.95 25.28 0.72 0.85 31.81 1.52 1.53 27.05 0.51 0.91 15.84 1.68 2.31
17
+ Khatt 200 45.44 0.45 0.74 27.97 0.64 0.91 67.09 0.19 0.45 28.41 1.12 0.88 27.25 5.04 5.19 89.13 0.07 0.22 20.56 0.61 1.14 25.09 0.67 1.06 14.86 0.76 1.00 13.35 4.25 3.77 15.15 0.83 0.92 27.26 1.60 1.80 18.84 0.89 1.22 16.60 1.46 1.86
18
+ ISI-PPT 500 89.96 0.08 0.18 79.44 0.15 0.31 90.45 0.06 0.15 55.48 1.03 1.01 73.15 0.36 0.54 52.42 0.52 0.53 68.32 0.31 0.43 59.80 0.55 0.77 18.63 0.81 1.03 33.34 2.75 3.58 2.53 0.98 0.98 34.36 1.27 1.39 16.69 0.82 1.46 46.98 1.95 2.30
19
+ ArabicOCR 50 83.47 0.06 0.26 70.21 0.16 0.46 98.79 0.00 0.02 58.87 1.25 1.51 63.84 1.00 1.00 99.26 0.00 0.01 98.99 0.01 0.02 75.84 0.56 0.76 26.49 0.77 1.00 80.93 0.15 0.20 99.38 0.01 0.11 94.89 0.02 0.08 51.06 0.53 0.79 83.58 0.18 0.34
20
+ Hindawi 200 60.13 0.34 0.56 43.20 0.48 0.71 97.77 0.01 0.04 22.56 1.82 2.05 24.31 1.00 1.00 89.89 0.11 0.15 61.36 0.31 0.50 64.88 0.40 0.72 22.04 0.76 1.00 66.42 0.26 0.42 89.75 0.06 0.28 67.05 0.27 0.42 36.48 0.63 0.87 65.11 0.24 0.51
21
+ EvArest 800 82.19 0.20 0.38 71.65 0.25 0.51 80.93 0.18 0.36 55.57 0.41 0.67 80.00 0.19 0.36 76.11 0.30 0.32 18.94 0.85 0.96 57.28 0.38 0.65 13.26 0.89 1.04 4.18 5.91 6.38 72.93 0.32 0.50 31.01 4.65 4.75 60.33 0.37 0.65 2.35 33.12 31.54
22
+ Average 3,760 61.01 0.31 0.55 47.21 0.43 0.71 77.95 0.13 0.32 33.94 1.48 1.55 49.23 1.20 1.41 78.33 0.20 0.28 39.62 0.54 0.84 45.47 0.58 0.89 16.73 0.79 1.02 20.61 4.95 5.61 50.97 0.52 0.69 39.77 1.80 1.93 30.02 1.05 1.45 30.52 4.37 4.67
23
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # Process the data into a proper DataFrame
26
+ lines = data_str.strip().split('\n')
27
+ headers = lines[0].split('\t')
28
+ subheaders = lines[1].split('\t')
29
 
30
+ # Extract model names
31
+ model_names = []
32
+ current_model = ""
33
+ for i, header in enumerate(headers):
34
+ if i >= 2 and header: # Skip 'Dataset' and 'Size'
35
+ current_model = header
36
+ model_names.append(current_model)
37
+
38
+ # Create a processed dataset for the main leaderboard
39
+ models_data = []
40
+ for model in ["GPT-4o", "GPT-4o-mini", "Gemini-2.0-Flash", "Qwen2-VL", "Qwen2.5-VL",
41
+ "AIN", "Tesseract", "EasyOCR", "Paddle", "Surya", "Microsoft", "Qari",
42
+ "Gemma3", "ArabicNougat"]:
43
+ # Get the average metrics for each model from the last row
44
+ last_row = lines[-1].split('\t')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ # Find the column indices for this model
47
+ model_idx = -1
48
+ for i, header in enumerate(headers):
49
+ if header == model:
50
+ model_idx = i
51
+ break
52
 
53
+ if model_idx == -1:
54
+ # Try finding as a substring
55
+ for i, header in enumerate(headers):
56
+ if model in header:
57
+ model_idx = i
58
+ break
59
 
60
+ if model_idx != -1:
61
+ # Get CHrF, CER, WER
62
+ chrf_idx = model_idx
63
+ cer_idx = model_idx + 1
64
+ wer_idx = model_idx + 2
65
+
66
+ try:
67
+ # Parse metrics
68
+ chrf = float(last_row[chrf_idx]) if chrf_idx < len(last_row) else 0
69
+ cer = float(last_row[cer_idx]) if cer_idx < len(last_row) else 0
70
+ wer = float(last_row[wer_idx]) if wer_idx < len(last_row) else 0
71
+
72
+ # Determine model type
73
+ model_type = "Closed-source" if model in ["GPT-4o", "GPT-4o-mini", "Gemini-2.0-Flash", "Claude-3-Opus"] else "Open-source"
74
+ # Add framework category
75
+ if model in ["Tesseract", "EasyOCR", "Paddle", "Surya"]:
76
+ model_type = "Framework"
77
+
78
+ # Organize by organization
79
+ org_map = {
80
+ "GPT-4o": "OpenAI",
81
+ "GPT-4o-mini": "OpenAI",
82
+ "Gemini-2.0-Flash": "Google",
83
+ "Qwen2-VL": "Alibaba",
84
+ "Qwen2.5-VL": "Alibaba",
85
+ "AIN": "MBZUAI",
86
+ "Tesseract": "Google",
87
+ "EasyOCR": "JaidedAI",
88
+ "Paddle": "Baidu",
89
+ "Surya": "VikParuchuri",
90
+ "Microsoft": "Microsoft",
91
+ "Qari": "Sakana AI",
92
+ "Gemma3": "Google",
93
+ "ArabicNougat": "Arabic NLP"
94
+ }
95
+
96
+ organization = org_map.get(model, "Unknown")
97
+
98
+ # Generate download counts (this is simulated)
99
+ import random
100
+ downloads = f"{random.randint(10, 600)}K"
101
+
102
+ # Add to models data
103
+ models_data.append({
104
+ "model": model,
105
+ "organization": organization,
106
+ "type": model_type,
107
+ "task": "OCR/Arabic",
108
+ "metrics": {
109
+ "chrf": chrf,
110
+ "cer": cer,
111
+ "wer": wer
112
+ },
113
+ "downloads": downloads,
114
+ "last_updated": "2025-04-01",
115
+ "model_url": f"https://huggingface.co/{organization}/{model}",
116
+ "paper_url": "https://arxiv.org/abs/2502.14949",
117
+ })
118
+ except Exception as e:
119
+ print(f"Error processing {model}: {e}")
120
+ continue
121
+
122
+ # Create detailed dataset for per-dataset comparisons
123
+ dataset_names = []
124
+ dataset_sizes = []
125
+ dataset_metrics = {}
126
+
127
+ for i in range(2, len(lines)-1): # Skip headers and the average line
128
+ parts = lines[i].split('\t')
129
+ if len(parts) > 1:
130
+ dataset = parts[0]
131
+ size = parts[1] if len(parts) > 1 else "0"
132
+
133
+ dataset_names.append(dataset)
134
+ dataset_sizes.append(size)
135
+
136
+ metrics = {}
137
+ for j, model in enumerate(model_names):
138
+ base_idx = j*3 + 2 # Starting column for each model (+2 for Dataset and Size columns)
139
+ if base_idx + 2 < len(parts):
140
+ try:
141
+ chrf = float(parts[base_idx]) if parts[base_idx] else 0
142
+ cer = float(parts[base_idx + 1]) if parts[base_idx + 1] else 0
143
+ wer = float(parts[base_idx + 2]) if parts[base_idx + 2] else 0
144
+ metrics[model] = {
145
+ "chrf": chrf,
146
+ "cer": cer,
147
+ "wer": wer
148
+ }
149
+ except (ValueError, IndexError) as e:
150
+ print(f"Error parsing metrics for {dataset}, {model}: {e}")
151
+ metrics[model] = {"chrf": 0, "cer": 0, "wer": 0}
152
+
153
+ dataset_metrics[dataset] = metrics
154
 
155
  # Define CSS for styling
156
  css = """
 
167
  .gradio-container {
168
  max-width: 1200px !important;
169
  }
 
 
 
 
 
 
 
 
 
170
  .header {
171
  background: linear-gradient(90deg, #FFDE59 0%, #FFC532 100%);
172
  padding: 20px;
 
215
  font-weight: 600;
216
  color: #374151;
217
  border-bottom: 1px solid #E5E7EB;
218
+ position: sticky;
219
+ top: 0;
220
+ z-index: 10;
221
  }
222
  td {
223
  padding: 12px;
 
251
  .footer a:hover {
252
  text-decoration: underline;
253
  }
254
+ .metric-table {
255
+ max-height: 600px;
256
+ overflow-y: auto;
257
+ }
258
+ .dataset-row:nth-child(odd) {
259
+ background-color: #F9FAFB;
260
+ }
261
+ .dataset-row:hover {
262
+ background-color: #EFF6FF;
263
+ }
264
+ .tab-active {
265
+ border-bottom: 2px solid #2563EB !important;
266
+ color: #2563EB !important;
267
+ font-weight: 600;
268
+ }
269
+ .metric-badge {
270
+ padding: 2px 8px;
271
+ border-radius: 9999px;
272
+ font-weight: 600;
273
+ font-size: 0.75rem;
274
+ display: inline-block;
275
+ }
276
+ .metric-good {
277
+ background-color: #DCFCE7;
278
+ color: #166534;
279
+ }
280
+ .metric-medium {
281
+ background-color: #FEF3C7;
282
+ color: #92400E;
283
+ }
284
+ .metric-poor {
285
+ background-color: #FEE2E2;
286
+ color: #B91C1C;
287
+ }
288
+ .chart-container {
289
+ margin-top: 20px;
290
+ overflow-x: auto;
291
+ }
292
  """
293
 
294
+ # Function to format metrics with color coding
295
+ def format_metric(metric_name, value):
296
+ if metric_name == "chrf":
297
+ if value > 75:
298
+ return f'<span class="metric-badge metric-good">{value:.1f}</span>'
299
+ elif value > 50:
300
+ return f'<span class="metric-badge metric-medium">{value:.1f}</span>'
301
+ else:
302
+ return f'<span class="metric-badge metric-poor">{value:.1f}</span>'
303
+ elif metric_name == "cer" or metric_name == "wer": # Lower is better
304
+ if value < 0.5:
305
+ return f'<span class="metric-badge metric-good">{value:.2f}</span>'
306
+ elif value < 1.0:
307
+ return f'<span class="metric-badge metric-medium">{value:.2f}</span>'
308
+ else:
309
+ return f'<span class="metric-badge metric-poor">{value:.2f}</span>'
310
+ return f"{value:.2f}"
311
 
312
+ # Function to filter models based on type
313
+ def filter_by_type(models, type_filter):
314
  if type_filter == "All":
315
+ return models
316
+ return [model for model in models if model["type"] == type_filter]
317
 
318
+ # Function to filter models based on search term
319
+ def filter_by_search(models, search_term):
320
  if not search_term:
321
+ return models
322
 
323
  # Convert search term to lowercase for case-insensitive search
324
  search_term = search_term.lower()
325
 
326
  # Filter based on model, organization, or task
327
+ filtered_models = []
328
+ for model in models:
329
+ if (search_term in model["model"].lower() or
330
+ search_term in model["organization"].lower() or
331
+ search_term in model["task"].lower()):
332
+ filtered_models.append(model)
333
 
334
+ return filtered_models
335
 
336
+ # Function to generate the main leaderboard HTML
337
+ def generate_main_leaderboard(models, sort_by, sort_order):
338
+ # Sort models
339
+ reverse = sort_order == "Descending"
340
 
341
+ # Define key function for sorting based on metric
342
+ def get_sort_key(model):
343
+ if sort_by == "model" or sort_by == "organization" or sort_by == "type" or sort_by == "task":
344
+ return model[sort_by]
345
+ elif sort_by == "downloads":
346
+ # Extract numeric part from download string (e.g., "24.5K" -> 24.5)
347
+ try:
348
+ return float(model[sort_by].replace("K", ""))
349
+ except:
350
+ return 0
351
+ elif sort_by == "chrf" or sort_by == "cer" or sort_by == "wer":
352
+ return model["metrics"][sort_by]
353
+ return 0
354
+
355
+ # For CER and WER, lower is better so reverse the sort order
356
+ if sort_by in ["cer", "wer"]:
357
+ reverse = not reverse
358
+
359
+ sorted_models = sorted(models, key=get_sort_key, reverse=reverse)
360
+
361
+ html = """
362
+ <div style="overflow-x: auto;">
363
+ <table style="width:100%">
364
+ <thead>
365
+ <tr>
366
+ <th>Model</th>
367
+ <th>Organization</th>
368
+ <th>Type</th>
369
+ <th>Task</th>
370
+ <th>CHrF ↑</th>
371
+ <th>CER ↓</th>
372
+ <th>WER ↓</th>
373
+ <th>Downloads</th>
374
+ <th>Links</th>
375
+ </tr>
376
+ </thead>
377
+ <tbody>
378
+ """
379
 
380
+ for model in sorted_models:
381
+ html += f"""
382
+ <tr>
383
+ <td>
384
+ <div style="font-weight: 500;">{model['model']}</div>
385
+ </td>
386
+ <td>{model['organization']}</td>
387
+ <td>
388
+ <span style="background-color: {'#DBEAFE' if model['type'] == 'Open-source' else '#FEF3C7' if model['type'] == 'Closed-source' else '#E0F2FE'};
389
+ padding: 2px 6px;
390
+ border-radius: 9999px;
391
+ font-size: 0.75rem;">
392
+ {model['type']}
393
+ </span>
394
+ </td>
395
+ <td>
396
+ <span style="background-color: #E0F2FE;
397
+ padding: 2px 6px;
398
+ border-radius: 9999px;
399
+ font-size: 0.75rem;">
400
+ {model['task']}
401
+ </span>
402
+ </td>
403
+ <td>{format_metric('chrf', model['metrics']['chrf'])}</td>
404
+ <td>{format_metric('cer', model['metrics']['cer'])}</td>
405
+ <td>{format_metric('wer', model['metrics']['wer'])}</td>
406
+ <td>{model['downloads']}</td>
407
+ <td>
408
+ <a href="{model['model_url']}" target="_blank">Model</a> |
409
+ <a href="{model['paper_url']}" target="_blank">Paper</a>
410
+ </td>
411
+ </tr>
412
+ """
413
+
414
+ html += """
415
+ </tbody>
416
+ </table>
417
+ </div>
418
+ """
419
+
420
+ return html
421
+
422
+ # Function to generate per-dataset comparison HTML
423
+ def generate_dataset_comparison(selected_datasets, selected_models, metric):
424
+ html = f"""
425
+ <div class="metric-table">
426
+ <table style="width:100%">
427
+ <thead>
428
+ <tr>
429
+ <th>Dataset</th>
430
+ <th>Size</th>
431
+ """
432
+
433
+ for model in selected_models:
434
+ html += f"<th>{model}</th>"
435
+
436
+ html += """
437
+ </tr>
438
+ </thead>
439
+ <tbody>
440
+ """
441
+
442
+ for dataset_idx, dataset in enumerate(selected_datasets):
443
+ size = dataset_sizes[dataset_names.index(dataset)]
444
+
445
+ html += f"""
446
+ <tr class="dataset-row">
447
+ <td style="font-weight: 500;">{dataset}</td>
448
+ <td>{size}</td>
449
+ """
450
+
451
+ for model in selected_models:
452
+ if model in dataset_metrics[dataset]:
453
+ value = dataset_metrics[dataset][model][metric.lower()]
454
+ html += f"<td>{format_metric(metric.lower(), value)}</td>"
455
+ else:
456
+ html += "<td>-</td>"
457
+
458
+ html += "</tr>"
459
+
460
+ html += """
461
+ </tbody>
462
+ </table>
463
+ </div>
464
+ """
465
+
466
+ return html
467
+
468
+ # Create the Gradio interface
469
+ def create_leaderboard_interface():
470
  with gr.Blocks(css=css) as demo:
471
  gr.HTML(f"""
472
  <div class="header">
 
493
  </div>
494
  </div>
495
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
+ with gr.Tabs() as tabs:
498
+ with gr.TabItem("Main Leaderboard", id=0):
499
+ # Filter controls
500
+ with gr.Row(equal_height=True):
501
+ type_filter = gr.Radio(
502
+ ["All", "Open-source", "Closed-source", "Framework"],
503
+ label="Model Type",
504
+ value="All",
505
+ interactive=True
506
+ )
507
+ search_input = gr.Textbox(
508
+ label="Search Models, Organizations, or Tasks",
509
+ placeholder="Type to search...",
510
+ interactive=True
511
+ )
512
 
513
+ with gr.Row(equal_height=True):
514
+ sort_by = gr.Dropdown(
515
+ ["model", "organization", "type", "chrf", "cer", "wer", "downloads"],
516
+ label="Sort by",
517
+ value="chrf",
518
+ interactive=True
519
+ )
520
+
521
+ sort_order = gr.Radio(
522
+ ["Descending", "Ascending"],
523
+ label="Sort Order",
524
+ value="Descending",
525
+ interactive=True
526
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
+ # Table output
529
+ leaderboard_output = gr.HTML()
530
+
531
+ # Update function for the main leaderboard
532
+ def update_leaderboard(type_filter, search_term, sort_by, sort_order):
533
+ filtered_models = filter_by_type(models_data, type_filter)
534
+ filtered_models = filter_by_search(filtered_models, search_term)
535
+ html = generate_main_leaderboard(filtered_models, sort_by, sort_order)
536
+
537
+ footer = f"""
538
+ <div class="footer">
539
+ <span>Showing {len(filtered_models)} of {len(models_data)} models</span>
540
+ <div>
541
+ <a href="https://github.com/mbzuai-oryx/KITAB-Bench" target="_blank">GitHub Repository</a>
542
+ <span style="margin: 0 8px;">|</span>
543
+ <a href="https://arxiv.org/abs/2502.14949" target="_blank">KITAB-Bench Paper</a>
544
+ </div>
545
+ </div>
546
+ """
547
+
548
+ return html + footer
549
+
550
+ # Set up event handlers for main leaderboard
551
+ type_filter.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
552
+ search_input.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
553
+ sort_by.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
554
+ sort_order.change(update_leaderboard, [type_filter, search_input, sort_by, sort_order], leaderboard_output)
555
+
556
+ with gr.TabItem("Dataset Comparison", id=1):
557
+ with gr.Row():
558
+ dataset_selector = gr.CheckboxGroup(
559
+ dataset_names,
560
+ label="Select Datasets",
561
+ value=dataset_names[:5], # Default to first 5 datasets
562
+ interactive=True)