add self upload feature #36
opened by MINGYISU
- .gitattributes +0 -1
- .gitignore +0 -2
- app.py +4 -4
- overview.png +0 -3
- results.csv +30 -0
- results.jsonl +0 -30
- urls.csv +24 -0
- utils.py +129 -71
.gitattributes
CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-overview.png filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -11,5 +11,3 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
-.gitignore
-.gradio
app.py
CHANGED
@@ -52,10 +52,11 @@ with gr.Blocks() as block:
                 label="Maximum number of parameters (B)",
             )
 
+            task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
             with gr.Row():
                 tasks_select = gr.CheckboxGroup(
-                    choices=
-                    value=
+                    choices=task_choices,
+                    value=task_choices,
                     label="Select tasks to Display",
                     elem_id="tasks-select"
                 )
@@ -100,11 +101,10 @@ with gr.Blocks() as block:
         # table 2
         with gr.TabItem("📝 About", elem_id="qa-tab-table2", id=2):
             gr.Markdown(LEADERBOARD_INFO, elem_classes="markdown-text")
-            gr.Image("overview.png", width=900, label="Dataset Overview")
 
         # table 3
         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
-block.launch()
+block.launch(share=True)
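Note: the app.py hunks shown here only touch the task selector and the `launch()` call; the UI wiring for the new self-upload path is not visible in this excerpt. Below is a minimal, hypothetical sketch of how a file upload could be connected to `utils.add_new_eval` — the component names (`upload_box`, `submit_button`, `status`) and the `type="binary"` choice are assumptions for illustration, not part of this diff.

```python
# Hypothetical wiring (not shown in this diff): pass an uploaded JSON file
# to utils.add_new_eval and surface its return value in the UI.
import gradio as gr
from utils import add_new_eval

with gr.Blocks() as block:
    upload_box = gr.File(label="Upload result JSON", type="binary")  # bytes work with json.loads
    submit_button = gr.Button("Submit")
    status = gr.Textbox(label="Submission status")
    # add_new_eval(input_file) parses the JSON, validates the required columns,
    # appends a row to results.csv, and pushes the Space repository.
    submit_button.click(fn=add_new_eval, inputs=upload_box, outputs=status)

block.launch()
```

Since `add_new_eval` in this PR returns a message only on error and prints on success, the status box would stay blank for successful submissions unless a return value is added.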
overview.png
DELETED
(binary image file, previously tracked with Git LFS)
results.csv
ADDED
@@ -0,0 +1,30 @@
+Models,Model Size(B),Data Source,Overall,Classification,VQA,Retrieval,Grounding
+clip-vit-large-patch14,0.428,TIGER-Lab,37.8,42.8,9.1,53.0,51.8
+blip2-opt-2.7b,3.74,TIGER-Lab,25.2,27.0,4.2,33.9,47.0
+siglip-base-patch16-224,0.203,TIGER-Lab,34.8,40.3,8.4,31.6,59.5
+open_clip-ViT-L/14,0.428,TIGER-Lab,39.7,47.8,10.9,52.3,53.3
+UniIR (BLIP_FF),0.247,TIGER-Lab,42.8,42.1,15.0,60.1,62.2
+UniIR (CLIP_SF),0.428,TIGER-Lab,44.7,44.3,16.2,61.8,65.3
+e5-v,8.36,TIGER-Lab,13.3,21.8,4.9,11.5,19.0
+Magiclens,0.428,TIGER-Lab,27.8,38.8,8.3,35.4,26.0
+CLIP-FT,0.428,TIGER-Lab,45.4,55.2,19.7,53.2,62.2
+OpenCLIP-FT,0.428,TIGER-Lab,47.2,56.0,21.9,55.4,64.1
+VLM2Vec (Phi-3.5-V-FT),4.15,TIGER-Lab,55.9,52.8,50.3,57.8,72.3
+VLM2Vec (Phi-3.5-V-LoRA),4.15,TIGER-Lab,60.1,54.8,54.9,62.3,79.5
+VLM2Vec (LLaVA-1.6-LoRA-LowRes),7.57,TIGER-Lab,55.0,54.7,50.3,56.2,64.0
+VLM2Vec (LLaVA-1.6-LoRA-HighRes),7.57,TIGER-Lab,62.9,61.2,49.9,67.4,86.1
+MMRet-MLLM (LLaVA-1.6),7.57,Self-Reported,44.0,47.2,18.4,56.5,62.2
+MMRet-MLLM (FT),7.57,Self-Reported,64.1,56.0,57.4,69.9,83.6
+mmE5-mllama-11b-instruct,10.6,Self-Reported,69.8,67.6,62.6,71.0,89.6
+mmE5 (w/ 560K synthetic data),10.6,Self-Reported,58.6,60.6,55.7,54.7,72.4
+MM-Embed,8.18,Self-Reported,50.0,48.1,32.3,63.8,57.8
+gme-Qwen2-VL-2B-Instruct,2.21,Self-Reported,55.8,56.9,41.2,67.8,53.4
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),8.29,TIGER-Lab,65.8,62.6,57.8,69.9,81.7
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),2.21,TIGER-Lab,59.3,59.0,49.4,65.4,73.4
+LLaVE-7B,8.03,Self-Reported,70.3,65.7,65.4,70.9,91.9
+LLaVE-2B,1.95,Self-Reported,65.2,62.1,60.2,65.2,84.9
+LLaVE-0.5B,0.894,Self-Reported,59.1,57.4,50.3,59.8,82.9
+UniME(LLaVA-OneVision-7B-LoRA-Res336),8.03,Self-Reported,70.7,66.8,66.6,70.5,90.9
+UniME(LLaVA-1.6-7B-LoRA-LowRes),7.57,Self-Reported,66.6,60.6,52.9,67.9,85.1
+UniME(Phi-3.5-V-LoRA),4.2,Self-Reported,64.2,54.8,55.9,64.5,81.8
+QQMM-embed,8.297,Self-Reported,72.175,70.07,69.52,71.175,87.075
results.jsonl
DELETED
@@ -1,30 +0,0 @@
-{"Models":"B3","Model Size(B)":8.29,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.0,"I-CLS":70.0,"I-QA":66.5,"I-RET":74.1,"I-VG":84.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/raghavlite\/B3_Qwen2_7B"}
-{"Models":"CLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":45.4,"I-CLS":55.2,"I-QA":19.7,"I-RET":53.2,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2103.00020"}
-{"Models":"LLaVE-0.5B","Model Size(B)":0.894,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":59.1,"I-CLS":57.4,"I-QA":50.3,"I-RET":59.8,"I-VG":82.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-0.5B"}
-{"Models":"LLaVE-2B","Model Size(B)":1.95,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":65.2,"I-CLS":62.1,"I-QA":60.2,"I-RET":65.2,"I-VG":84.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-2B"}
-{"Models":"LLaVE-7B","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.3,"I-CLS":65.7,"I-QA":65.4,"I-RET":70.9,"I-VG":91.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-7B"}
-{"Models":"MM-Embed","Model Size(B)":8.18,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":50.0,"I-CLS":48.1,"I-QA":32.3,"I-RET":63.8,"I-VG":57.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/nvidia\/MM-Embed"}
-{"Models":"MMRet-MLLM (FT)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.1,"I-CLS":56.0,"I-QA":57.4,"I-RET":69.9,"I-VG":83.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"MMRet-MLLM (LLaVA-1.6)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":44.0,"I-CLS":47.2,"I-QA":18.4,"I-RET":56.5,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"Magiclens","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":27.8,"I-CLS":38.8,"I-QA":8.3,"I-RET":35.4,"I-VG":26.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/google-deepmind\/magiclens"}
-{"Models":"OpenCLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":47.2,"I-CLS":56.0,"I-QA":21.9,"I-RET":55.4,"I-VG":64.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2212.07143"}
-{"Models":"QQMM-embed","Model Size(B)":8.297,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.175,"I-CLS":70.07,"I-QA":69.52,"I-RET":71.175,"I-VG":87.075,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/QQ-MM\/QQMM-embed"}
-{"Models":"UniIR (BLIP_FF)","Model Size(B)":0.247,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":42.8,"I-CLS":42.1,"I-QA":15.0,"I-RET":60.1,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniIR (CLIP_SF)","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":44.7,"I-CLS":44.3,"I-QA":16.2,"I-RET":61.8,"I-VG":65.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniME(LLaVA-1.6-7B-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":66.6,"I-CLS":60.6,"I-QA":52.9,"I-RET":67.9,"I-VG":85.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-1.6-7B"}
-{"Models":"UniME(LLaVA-OneVision-7B-LoRA-Res336)","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.7,"I-CLS":66.8,"I-QA":66.6,"I-RET":70.5,"I-VG":90.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-OneVision-7B"}
-{"Models":"UniME(Phi-3.5-V-LoRA)","Model Size(B)":4.2,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.2,"I-CLS":54.8,"I-QA":55.9,"I-RET":64.5,"I-VG":81.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-Phi3.5-V-4.2B"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-HighRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":62.9,"I-CLS":61.2,"I-QA":49.9,"I-RET":67.4,"I-VG":86.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.0,"I-CLS":54.7,"I-QA":50.3,"I-RET":56.2,"I-VG":64.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (Phi-3.5-V-FT)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.9,"I-CLS":52.8,"I-QA":50.3,"I-RET":57.8,"I-VG":72.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Phi-3.5-V-LoRA)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":60.1,"I-CLS":54.8,"I-QA":54.9,"I-RET":62.3,"I-VG":79.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Qwen2-VL-2B-LoRA-HighRes)","Model Size(B)":2.21,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":59.3,"I-CLS":59.0,"I-QA":49.4,"I-RET":65.4,"I-VG":73.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-2B"}
-{"Models":"VLM2Vec (Qwen2-VL-7B-LoRA-HighRes)","Model Size(B)":8.29,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":65.8,"I-CLS":62.6,"I-QA":57.8,"I-RET":69.9,"I-VG":81.7,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-7B"}
-{"Models":"blip2-opt-2.7b","Model Size(B)":3.74,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":25.2,"I-CLS":27.0,"I-QA":4.2,"I-RET":33.9,"I-VG":47.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Salesforce\/blip2-opt-2.7b"}
-{"Models":"clip-vit-large-patch14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":37.8,"I-CLS":42.8,"I-QA":9.1,"I-RET":53.0,"I-VG":51.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"}
-{"Models":"e5-v","Model Size(B)":8.36,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":13.3,"I-CLS":21.8,"I-QA":4.9,"I-RET":11.5,"I-VG":19.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/royokong\/e5-v"}
-{"Models":"gme-Qwen2-VL-2B-Instruct","Model Size(B)":2.21,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":55.8,"I-CLS":56.9,"I-QA":41.2,"I-RET":67.8,"I-VG":53.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Alibaba-NLP\/gme-Qwen2-VL-2B-Instruct"}
-{"Models":"mmE5 (w\/ 560K synthetic data)","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":58.6,"I-CLS":60.6,"I-QA":55.7,"I-RET":54.7,"I-VG":72.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"mmE5-mllama-11b-instruct","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":69.8,"I-CLS":67.6,"I-QA":62.6,"I-RET":71.0,"I-VG":89.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"open_clip-ViT-L\/14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":39.7,"I-CLS":47.8,"I-QA":10.9,"I-RET":52.3,"I-VG":53.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/mlfoundations\/open_clip"}
-{"Models":"siglip-base-patch16-224","Model Size(B)":0.203,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":34.8,"I-CLS":40.3,"I-QA":8.4,"I-RET":31.6,"I-VG":59.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/google\/siglip-base-patch16-224"}
urls.csv
ADDED
@@ -0,0 +1,24 @@
+Models,URL
+clip-vit-large-patch14,https://huggingface.co/openai/clip-vit-large-patch14
+blip2-opt-2.7b,https://huggingface.co/Salesforce/blip2-opt-2.7b
+siglip-base-patch16-224,https://huggingface.co/google/siglip-base-patch16-224
+open_clip-ViT-L/14,https://github.com/mlfoundations/open_clip
+e5-v,https://huggingface.co/royokong/e5-v
+Magiclens,https://github.com/google-deepmind/magiclens
+MMRet,https://huggingface.co/JUNJIE99/MMRet-large
+VLM2Vec-Phi-3.5-v,https://huggingface.co/TIGER-Lab/VLM2Vec-Full
+VLM2Vec,https://github.com/TIGER-AI-Lab/VLM2Vec
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-7B
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-2B
+UniIR,https://huggingface.co/TIGER-Lab/UniIR
+OpenCLIP-FT,https://doi.org/10.48550/arXiv.2212.07143
+CLIP-FT,https://doi.org/10.48550/arXiv.2103.00020
+mmE5,https://huggingface.co/intfloat/mmE5-mllama-11b-instruct
+gme-Qwen2-VL-2B-Instruct,https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct
+MM-Embed,https://huggingface.co/nvidia/MM-Embed
+LLaVE-7B,https://huggingface.co/zhibinlan/LLaVE-7B
+LLaVE-2B,https://huggingface.co/zhibinlan/LLaVE-2B
+LLaVE-0.5B,https://huggingface.co/zhibinlan/LLaVE-0.5B
+UniME(LLaVA-OneVision-7B-LoRA-Res336),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-OneVision-7B
+UniME(LLaVA-1.6-7B-LoRA-LowRes),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-1.6-7B
+UniME(Phi-3.5-V-LoRA),https://huggingface.co/DeepGlint-AI/UniME-Phi3.5-V-4.2B
utils.py
CHANGED
@@ -10,39 +10,64 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-
-TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
-TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]
-COLUMN_NAMES = BASE_COLS + TASKS_V1 + TASKS_V2
-
-
+TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
+
+MODEL_INFO = [
+    "Rank", "Models", "Model Size(B)", "Data Source",
+    "Overall",
+    "Classification", "VQA", "Retrieval", "Grounding"
+]
+
+BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
+
+DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+SUBMISSION_NAME = "MMEB"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/TIGER-Lab/", SUBMISSION_NAME)
+FILE_NAME = "results.csv"
+CSV_DIR = "./results.csv"
+
+COLUMN_NAMES = MODEL_INFO
 
 LEADERBOARD_INTRODUCTION = """
-#
+# MMEB Leaderboard
 
 ## Introduction
-We introduce a novel benchmark,
+We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
 which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
 and evaluating embedding models across various combinations of text and image modalities.
 All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
-or a combination of both. MMEB
+or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
 training, and 16 out-of-distribution datasets, reserved for evaluation.
 
-
-
-
-
-| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
-| [**📖MMEB-V2/VLM2Vec-V2 Paper (TBA)**](https://arxiv.org/abs/2410.05160)
-| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
-| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
+The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160. \n
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
+Overview: https://tiger-ai-lab.github.io/VLM2Vec/. \n
 """
 
-TABLE_INTRODUCTION = """
+TABLE_INTRODUCTION = """"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
+MMEB is organized into four primary meta-task categories:
+- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
+consist of instructions and images, optionally accompanied by related text. Targets are class labels,
+and the number of class labels corresponds to the number of classes in the dataset. \n
+  - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
+  - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
+- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
+datasets. The query consists of an instruction, an image, and a piece of text as the question, while
+the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
+  - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
+  - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
+- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
+Both the query and target sides can involve a combination of text, images, and instructions. Similar
+to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
+  - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
+  - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
+- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
+  - IND: MSCOCO \n
+  - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -56,84 +81,112 @@ CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
 SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit the JSON file with the following format:
-
-### **TO SUBMIT V1 ONLY**
 ```json
 [
     {
         "Model": "<Model Name>",
-        "URL": "<Model URL>"
-        "Model Size(B)": 1000
-        "Data Source":
-        "
-        "
-        "
-        "
-        "
+        <Optional>"URL": "<Model URL>",
+        "Model Size(B)": 1000,
+        "Data Source": Self-Reported,
+        "Overall": 50.0,
+        "Classification": 50.0,
+        "VQA": 50.0,
+        "Retrieval": 50.0,
+        "Grounding": 50.0
     },
 ]
 ```
-
-
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V2-Overall": 50.0,
-        "V-CLS": 50.0,
-        "V-QA": 50.0,
-        "V-RET": 50.0,
-        "V-MRET": 50.0,
-        "VisDoc": 50.0
-    },
-]
-```
-You are also welcome to submit both versions by including all the fields above! :) \n
-You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for instructions about evaluating your model. \n
+You may refer to the Github page for instructions about evaluating your model.
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
 Please send us an email at [email protected], attaching the JSON file. We will review your submission and update the leaderboard accordingly.
 """
 
+MODEL_URLS = pd.read_csv("urls.csv")
+MODEL_URLS = dict(zip(MODEL_URLS['Models'], MODEL_URLS['URL']))
+
 def create_hyperlinked_names(df):
     def convert_url(url, model_name):
-        return f'<a href="{url}">{model_name}</a>'
-
-    def add_link_to_model_name(
-
+        return f'<a href="{url}">{model_name}</a>'
+
+    def add_link_to_model_name(model_name):
+        if "VLM2Vec (Phi-3.5-V-" in model_name:
+            url = MODEL_URLS["VLM2Vec-Phi-3.5-v"]
+            return convert_url(url, model_name)
+        if "VLM2Vec (LLaVA-1.6-LoRA-" in model_name:
+            url = MODEL_URLS["VLM2Vec"]
+            return convert_url(url, model_name)
+        if "UniIR" in model_name:
+            url = MODEL_URLS["UniIR"]
+            return convert_url(url, model_name)
+        if "mmE5" in model_name:
+            url = MODEL_URLS["mmE5"]
+            return convert_url(url, model_name)
+        if "MMRet" in model_name:
+            url = MODEL_URLS["MMRet"]
+            return convert_url(url, model_name)
+        return convert_url(MODEL_URLS[model_name], model_name) if model_name in MODEL_URLS else model_name
 
     df = df.copy()
-    df = df.apply(add_link_to_model_name
+    df['Models'] = df['Models'].apply(add_link_to_model_name)
     return df
 
-
-#
-# return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
-
-def get_df(file="results.jsonl"):
-    df = pd.read_json(file, orient='records', lines=True)
+def get_df():
+    # fetch the leaderboard data
+    url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
+    response = requests.get(url, headers={"Authorization": f"Bearer {HF_TOKEN}"})
+    if response.status_code != 200:
+        import sys
+        sys.exit(f"Error: {response.status_code}")
+    df = pd.read_csv(io.StringIO(response.text))
+    df.to_csv(CSV_DIR, index=False)  # update local file
     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
-
-    if df[task].isnull().any():
-        df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
-    df = df.sort_values(by=['V1-Overall'], ascending=False)
+    df = df.sort_values(by=['Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df
 
+
+def add_new_eval(input_file):
+    if input_file is None:
+        return "Error! Empty file!"
+
+    # Load the input json file
+    upload_data = json.loads(input_file)
+    print("upload_data:\n", upload_data)
+    data_row = [f'{upload_data["Model"]}']
+    for col in ['Overall', 'Model Size(B)'] + TASKS:
+        if not col in upload_data.keys():
+            return f"Error! Missing {col} column!"
+        data_row += [upload_data[col]]
+    if 'URL' in upload_data.keys():
+        MODEL_URLS[upload_data['Model']] = upload_data['URL']
+    print("data_row:\n", data_row)
+    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
+                                 use_auth_token=HF_TOKEN, repo_type="space")
+    submission_repo.git_pull()
+
+    # Track submitted models
+    already_submitted = []
+    with open(CSV_DIR, mode='r') as file:
+        reader = csv.reader(file, delimiter=',')
+        for row in reader:
+            already_submitted.append(row[0])
+    # if not in the existing models list, add it to the csv file
+    if data_row[0] not in already_submitted:
+        with open(CSV_DIR, mode='a', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(data_row)
+
+        submission_repo.push_to_hub()
+        print('Submission Successful')
+    else:
+        print('The model already exists in the leaderboard!')
+
 def refresh_data():
     df = get_df()
     return df[COLUMN_NAMES]
 
+
 def search_and_filter_models(df, query, min_size, max_size):
     filtered_df = df.copy()
 
@@ -170,6 +223,7 @@ def process_model_size(size):
     except (ValueError, TypeError):
         return 'unknown'
 
+
 def filter_columns_by_tasks(df, selected_tasks=None):
     if selected_tasks is None or len(selected_tasks) == 0:
         return df[COLUMN_NAMES]
@@ -179,3 +233,7 @@ def filter_columns_by_tasks(df, selected_tasks=None):
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
+
+
+def get_task_choices():
+    return TASKS
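For reference, here is a small self-contained sketch of the payload `add_new_eval` expects, with the same column checks it performs before appending a row to `results.csv`. The values are purely illustrative; the real function additionally clones the Space with `Repository` and pushes the updated `results.csv`, which needs a write-enabled `HF_TOKEN`. Note that the code indexes the parsed object directly, so a single JSON object parses cleanly here, whereas the `SUBMIT_INTRODUCTION` example wraps it in a list.

```python
# Illustrative submission payload plus the same column validation that the
# new add_new_eval performs before writing a row to results.csv.
import json

TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]

submission = {
    "Model": "my-embedding-model",       # made-up name
    "URL": "https://example.com/model",  # optional field
    "Model Size(B)": 7.0,
    "Data Source": "Self-Reported",
    "Overall": 50.0,
    "Classification": 50.0,
    "VQA": 50.0,
    "Retrieval": 50.0,
    "Grounding": 50.0,
}

upload_data = json.loads(json.dumps(submission))  # mirrors json.loads(input_file)
data_row = [upload_data["Model"]]
for col in ["Overall", "Model Size(B)"] + TASKS:
    if col not in upload_data:
        raise ValueError(f"Missing {col} column!")
    data_row.append(upload_data[col])
print(data_row)  # the row that would be appended to results.csv
```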