import gradio as gr
import pandas as pd
import numpy as np

# Data from the provided CSV
data = {
    "model": [
        "GPT-4o", "GPT-4o-mini", "Gemini-2.0-Flash", "Qwen2-VL", "Qwen2.5-VL",
        "AIN-7B", "Tesseract", "EasyOCR", "Paddle", "Surya", "AzureOCR",
        "Qaari", "Gemma3", "ArabicNougat",
    ],
    "organization": [
        "OpenAI", "OpenAI", "Google", "Alibaba", "Alibaba", "MBZUAI", "Google",
        "JaidedAI", "Baidu", "VikParuchuri", "Microsoft", "NAMAA", "Google",
        "MohamedRashad",
    ],
    "type": [
        "Closed-source", "Closed-source", "Closed-source", "Open-source",
        "Open-source", "Open-source", "Open-source", "Open-source",
        "Open-source", "Open-source", "Closed-source", "Open-source",
        "Open-source", "Open-source",
    ],
    "task": [
        "OCR/Vision", "OCR/Vision", "OCR/Vision", "OCR/Vision", "OCR/Vision",
        "OCR/Vision", "OCR", "OCR", "OCR", "OCR/Arabic", "OCR/Vision",
        "OCR/Arabic", "OCR/Vision", "OCR/Document",
    ],
    "chrf": [
        61.01, 47.21, 77.95, 33.94, 49.23, 78.33, 39.62, 45.47, 16.73, 20.61,
        50.97, 39.77, 30.02, 30.52,
    ],
    "cer": [
        0.31, 0.43, 0.13, 1.48, 1.20, 0.20, 0.54, 0.58, 0.79, 4.95, 0.52,
        1.80, 1.05, 4.37,
    ],
    "wer": [
        0.55, 0.71, 0.32, 1.55, 1.41, 0.28, 0.84, 0.89, 1.02, 5.61, 0.69,
        1.93, 1.45, 4.67,
    ],
    "downloads": [
        "-", "-", "-", "1262K", "3313K", "0.86K", "-", "-", "-", "39K", "-",
        "5K", "227K", "0.75K",
    ],
    "model_url": [
        "https://openai.com/index/hello-gpt-4o/",
        "https://platform.openai.com/docs/models/gpt-4o-mini",
        "https://deepmind.google/technologies/gemini/flash/",
        "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
        "https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct",
        "https://huggingface.co/MBZUAI/AIN",
        "https://github.com/tesseract-ocr/tesseract",
        "https://github.com/JaidedAI/EasyOCR",
        "https://github.com/PaddlePaddle/PaddleOCR",
        "https://github.com/VikParuchuri/surya",
        "https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview-ocr",
        "https://huggingface.co/NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct",
        "https://huggingface.co/google/gemma-3-12b-it",
        "https://huggingface.co/MohamedRashad/arabic-large-nougat",
    ],
    "paper_url": [
        "https://arxiv.org/abs/2410.21276",
        "https://arxiv.org/abs/2410.21276",
        "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
        "https://arxiv.org/abs/2409.12191",
        "https://arxiv.org/abs/2502.13923",
        "https://arxiv.org/abs/2502.00094",
        "https://github.com/tesseract-ocr/tesseract",
        "https://github.com/JaidedAI/EasyOCR",
        "https://arxiv.org/abs/2206.03001",
        "https://github.com/VikParuchuri/surya",
        "https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/overview-ocr",
        "https://huggingface.co/NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct",
        "https://developers.googleblog.com/en/introducing-gemma3/",
        "https://arxiv.org/abs/2411.17835",
    ],
}

df = pd.DataFrame(data)


def format_dataframe(df):
    """Render the raw leaderboard columns as HTML strings for display.

    The HTML tags inside these f-strings were stripped during extraction; the
    <span>/<a> markup below is reconstructed from context. The >60 / >40
    thresholds and the amber (#F59E0B) and red (#EF4444) colors survive from
    the original; the green #10B981 is an assumption.
    """
    formatted_df = df.copy()
    # Color-code chrF: green above 60, amber above 40, red otherwise.
    formatted_df['chrf'] = formatted_df['chrf'].apply(
        lambda x: f"<span style='color: {'#10B981' if x > 60 else '#F59E0B' if x > 40 else '#EF4444'}'>{x:.1f}</span>"
    )
    formatted_df['cer'] = formatted_df['cer'].apply(lambda x: f"{x:.2f}")
    formatted_df['wer'] = formatted_df['wer'].apply(lambda x: f"{x:.2f}")
    # Link each model name to its model page, and add a "Paper" link; the raw
    # URL columns are dropped below once they have been folded into the links.
    formatted_df['model'] = formatted_df.apply(
        lambda row: f"<a href='{row['model_url']}' target='_blank'>{row['model']}</a>",
        axis=1,
    )
    formatted_df['paper'] = formatted_df.apply(
        lambda row: f"<a href='{row['paper_url']}' target='_blank'>Paper</a>",
        axis=1,
    )
    # The original wrapped type/task in styled spans; the styling was lost in
    # extraction, so plain spans are used here.
    formatted_df['type'] = formatted_df['type'].apply(lambda x: f"<span>{x}</span>")
    formatted_df['task'] = formatted_df['task'].apply(lambda x: f"<span>{x}</span>")
    formatted_df = formatted_df.drop(columns=['model_url', 'paper_url'])
    return formatted_df
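
# Quick self-check (added for illustration, not in the original app): the
# formatting step should keep the row count, render link/color markup as
# strings, and drop the raw URL columns.
_preview = format_dataframe(df)
assert len(_preview) == len(df)
assert "model_url" not in _preview.columns and "paper_url" not in _preview.columns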
css = """
#leaderboard-title { text-align: center; margin-bottom: 0; }
#leaderboard-subtitle { text-align: center; margin-top: 0; color: #6B7280; font-size: 1rem; }
.gradio-container { max-width: 1200px !important; }
.hf-logo { display: flex; align-items: center; justify-content: center; margin-bottom: 1rem; }
.hf-logo img { height: 50px; }
.header { background: linear-gradient(90deg, #FFDE59 0%, #FFC532 100%); padding: 20px; border-radius: 8px; margin-bottom: 20px; display: flex; align-items: center; justify-content: space-between; }
.header img { height: 40px; margin-right: 15px; }
.header-content { display: flex; align-items: center; }
.header-text { display: flex; flex-direction: column; }
.header-text h1 { margin: 0; font-size: 1.5rem; font-weight: bold; color: black; }
.header-text p { margin: 0; color: rgba(0, 0, 0, 0.8); }
.filter-container { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 20px; }
table { width: 100%; border-collapse: collapse; }
th { background-color: #F9FAFB; text-align: left; padding: 12px; font-weight: 600; color: #374151; border-bottom: 1px solid #E5E7EB; }
td { padding: 12px; border-bottom: 1px solid #E5E7EB; }
tr:hover { background-color: #F9FAFB; }
a { color: #2563EB; text-decoration: none; }
a:hover { text-decoration: underline; }
.footer { display: flex; justify-content: space-between; align-items: center; padding: 10px 0; color: #6B7280; font-size: 0.875rem; margin-top: 20px; }
.footer a { color: #2563EB; text-decoration: none; display: inline-flex; align-items: center; }
.footer a:hover { text-decoration: underline; }
"""

# Hugging Face logo SVG (in-lined for simplicity in the original; the SVG
# markup itself was lost in extraction, so the placeholder is left empty).
hf_logo = """ """


def filter_by_type(df, type_filter):
    if type_filter == "All":
        return df
    return df[df["type"].str.contains(type_filter)]


def filter_by_search(df, search_term):
    if not search_term:
        return df
    search_term = search_term.lower()
    mask = (
        df["model"].str.lower().str.contains(search_term)
        | df["organization"].str.lower().str.contains(search_term)
        | df["task"].str.lower().str.contains(search_term)
    )
    return df[mask]
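
# Example (illustrative only): the two filters compose by simple chaining.
# With the data above, filter_by_type keeps the 10 open-source rows, and
# searching "google" within them matches Tesseract and Gemma3 via their
# organization column.
_open_models = filter_by_type(df, "Open-source")
_google_open = filter_by_search(_open_models, "google")
assert list(_google_open["model"]) == ["Tesseract", "Gemma3"]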
def create_leaderboard_interface():
    df_orig = pd.DataFrame(data)
    df_orig = df_orig.sort_values(by="cer", ascending=True)

    with gr.Blocks(css=css) as demo:
        # Header banner. The inner markup of this gr.HTML call was stripped in
        # extraction; only the subtitle text survived, so the structure (and
        # the h1 title) is reconstructed from the .header/.header-text CSS.
        gr.HTML(f"""
        <div class="header">
            <div class="header-content">
                {hf_logo}
                <div class="header-text">
                    <h1>KITAB-Bench Leaderboard</h1>
                    <p>Arabic OCR and Document Understanding Benchmark</p>
                </div>
            </div>
        </div>
        """)

        # Filter controls wired to the two helpers above. The exact widgets
        # were lost in extraction; a Radio plus a Textbox is an assumption
        # consistent with filter_by_type / filter_by_search.
        with gr.Row(elem_classes="filter-container"):
            type_filter = gr.Radio(
                choices=["All", "Open-source", "Closed-source"],
                value="All",
                label="Type",
            )
            search_box = gr.Textbox(
                label="Search",
                placeholder="Search by model, organization, or task...",
            )
        # Results table, rendered as an HTML <table> styled by the CSS above.
        # The column set survives from the extracted source: Model,
        # Organization, Type, Task, chrF ↑, CER ↓, WER ↓, Downloads, Paper.
        table_html = gr.HTML()

        def render_table(type_value, search_term):
            filtered = filter_by_type(df_orig, type_value)
            filtered = filter_by_search(filtered, search_term)
            formatted = format_dataframe(filtered)
            header = (
                "<tr><th>Model</th><th>Organization</th><th>Type</th>"
                "<th>Task</th><th>chrF ↑</th><th>CER ↓</th><th>WER ↓</th>"
                "<th>Downloads</th><th>Paper</th></tr>"
            )
            rows = "".join(
                f"<tr><td>{row['model']}</td><td>{row['organization']}</td>"
                f"<td>{row['type']}</td><td>{row['task']}</td>"
                f"<td>{row['chrf']}</td><td>{row['cer']}</td>"
                f"<td>{row['wer']}</td><td>{row['downloads']}</td>"
                f"<td>{row['paper']}</td></tr>"
                for _, row in formatted.iterrows()
            )
            return f"<table>{header}{rows}</table>"

        type_filter.change(render_table, [type_filter, search_box], table_html)
        search_box.change(render_table, [type_filter, search_box], table_html)
        demo.load(render_table, [type_filter, search_box], table_html)
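
        # Design note: both widgets feed the same render function, and
        # demo.load() runs it once at startup so the table is populated before
        # the user touches a filter. This wiring is reconstructed; the
        # extracted source did not preserve the original event hookup.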
        # Footer. Only the sentence survived extraction; the href is filled in
        # from the public KITAB-Bench project page and may need verification.
        gr.HTML("""
        <div class="footer">
            <p>For more information about KITAB-Bench, visit the
            <a href="https://mbzuai-oryx.github.io/KITAB-Bench/" target="_blank">project website</a>.</p>
        </div>
        """)

    return demo
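
# Entry point. The extracted source ended at the footer, so this launch guard
# is an assumption about how the Space starts the app.
if __name__ == "__main__":
    demo = create_leaderboard_interface()
    demo.launch()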