add self upload feature

#36 opened by MINGYISU

Files changed (8):
  1. .gitattributes +0 -1
  2. .gitignore +0 -2
  3. app.py +4 -4
  4. overview.png +0 -3
  5. results.csv +30 -0
  6. results.jsonl +0 -30
  7. urls.csv +24 -0
  8. utils.py +129 -71
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
-overview.png filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED
@@ -11,5 +11,3 @@ eval-results/
eval-queue-bk/
eval-results-bk/
logs/
-.gitignore
-.gradio

app.py CHANGED
@@ -52,10 +52,11 @@ with gr.Blocks() as block:
        label="Maximum number of parameters (B)",
    )

+   task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
    with gr.Row():
        tasks_select = gr.CheckboxGroup(
-           choices=TASKS_V1 + TASKS_V2,
-           value=TASKS_V1,
+           choices=task_choices,
+           value=task_choices,
            label="Select tasks to Display",
            elem_id="tasks-select"
        )
@@ -100,11 +101,10 @@ with gr.Blocks() as block:
    # table 2
    with gr.TabItem("📝 About", elem_id="qa-tab-table2", id=2):
        gr.Markdown(LEADERBOARD_INFO, elem_classes="markdown-text")
-       gr.Image("overview.png", width=900, label="Dataset Overview")

    # table 3
    with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

-block.launch()
+block.launch(share=True)

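For context, the new `choices` expression in app.py resolves against the constants this PR adds to utils.py later in the diff. A minimal standalone sketch (mirroring those definitions rather than importing the Space's modules) shows what the checkbox group ends up containing:

```python
# Mirrors the constants this PR adds to utils.py (see the utils.py diff below);
# illustrative sketch only, not code from the Space itself.
TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
MODEL_INFO = [
    "Rank", "Models", "Model Size(B)", "Data Source",
    "Overall",
    "Classification", "VQA", "Retrieval", "Grounding",
]
BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]  # metadata columns plus "Overall"
COLUMN_NAMES = MODEL_INFO

# Same expression app.py now uses to populate the CheckboxGroup:
task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
print(task_choices)  # ['Classification', 'VQA', 'Retrieval', 'Grounding']
```

So the task selector offers only the four per-task columns, while "Overall" stays grouped with the metadata columns in BASE_COLS.
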
overview.png DELETED

Git LFS Details

  • SHA256: 80e1cd3d78d9a2e5523edc1dbf8ca37cfac20a6ed5772e5a198ee920aa114774
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
results.csv ADDED
@@ -0,0 +1,30 @@
+Models,Model Size(B),Data Source,Overall,Classification,VQA,Retrieval,Grounding
+clip-vit-large-patch14,0.428,TIGER-Lab,37.8,42.8,9.1,53.0,51.8
+blip2-opt-2.7b,3.74,TIGER-Lab,25.2,27.0,4.2,33.9,47.0
+siglip-base-patch16-224,0.203,TIGER-Lab,34.8,40.3,8.4,31.6,59.5
+open_clip-ViT-L/14,0.428,TIGER-Lab,39.7,47.8,10.9,52.3,53.3
+UniIR (BLIP_FF),0.247,TIGER-Lab,42.8,42.1,15.0,60.1,62.2
+UniIR (CLIP_SF),0.428,TIGER-Lab,44.7,44.3,16.2,61.8,65.3
+e5-v,8.36,TIGER-Lab,13.3,21.8,4.9,11.5,19.0
+Magiclens,0.428,TIGER-Lab,27.8,38.8,8.3,35.4,26.0
+CLIP-FT,0.428,TIGER-Lab,45.4,55.2,19.7,53.2,62.2
+OpenCLIP-FT,0.428,TIGER-Lab,47.2,56.0,21.9,55.4,64.1
+VLM2Vec (Phi-3.5-V-FT),4.15,TIGER-Lab,55.9,52.8,50.3,57.8,72.3
+VLM2Vec (Phi-3.5-V-LoRA),4.15,TIGER-Lab,60.1,54.8,54.9,62.3,79.5
+VLM2Vec (LLaVA-1.6-LoRA-LowRes),7.57,TIGER-Lab,55.0,54.7,50.3,56.2,64.0
+VLM2Vec (LLaVA-1.6-LoRA-HighRes),7.57,TIGER-Lab,62.9,61.2,49.9,67.4,86.1
+MMRet-MLLM (LLaVA-1.6),7.57,Self-Reported,44.0,47.2,18.4,56.5,62.2
+MMRet-MLLM (FT),7.57,Self-Reported,64.1,56.0,57.4,69.9,83.6
+mmE5-mllama-11b-instruct,10.6,Self-Reported,69.8,67.6,62.6,71.0,89.6
+mmE5 (w/ 560K synthetic data),10.6,Self-Reported,58.6,60.6,55.7,54.7,72.4
+MM-Embed,8.18,Self-Reported,50.0,48.1,32.3,63.8,57.8
+gme-Qwen2-VL-2B-Instruct,2.21,Self-Reported,55.8,56.9,41.2,67.8,53.4
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),8.29,TIGER-Lab,65.8,62.6,57.8,69.9,81.7
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),2.21,TIGER-Lab,59.3,59.0,49.4,65.4,73.4
+LLaVE-7B,8.03,Self-Reported,70.3,65.7,65.4,70.9,91.9
+LLaVE-2B,1.95,Self-Reported,65.2,62.1,60.2,65.2,84.9
+LLaVE-0.5B,0.894,Self-Reported,59.1,57.4,50.3,59.8,82.9
+UniME(LLaVA-OneVision-7B-LoRA-Res336),8.03,Self-Reported,70.7,66.8,66.6,70.5,90.9
+UniME(LLaVA-1.6-7B-LoRA-LowRes),7.57,Self-Reported,66.6,60.6,52.9,67.9,85.1
+UniME(Phi-3.5-V-LoRA),4.2,Self-Reported,64.2,54.8,55.9,64.5,81.8
+QQMM-embed,8.297,Self-Reported,72.175,70.07,69.52,71.175,87.075

results.jsonl DELETED
@@ -1,30 +0,0 @@
-{"Models":"B3","Model Size(B)":8.29,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.0,"I-CLS":70.0,"I-QA":66.5,"I-RET":74.1,"I-VG":84.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/raghavlite\/B3_Qwen2_7B"}
-{"Models":"CLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":45.4,"I-CLS":55.2,"I-QA":19.7,"I-RET":53.2,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2103.00020"}
-{"Models":"LLaVE-0.5B","Model Size(B)":0.894,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":59.1,"I-CLS":57.4,"I-QA":50.3,"I-RET":59.8,"I-VG":82.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-0.5B"}
-{"Models":"LLaVE-2B","Model Size(B)":1.95,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":65.2,"I-CLS":62.1,"I-QA":60.2,"I-RET":65.2,"I-VG":84.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-2B"}
-{"Models":"LLaVE-7B","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.3,"I-CLS":65.7,"I-QA":65.4,"I-RET":70.9,"I-VG":91.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-7B"}
-{"Models":"MM-Embed","Model Size(B)":8.18,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":50.0,"I-CLS":48.1,"I-QA":32.3,"I-RET":63.8,"I-VG":57.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/nvidia\/MM-Embed"}
-{"Models":"MMRet-MLLM (FT)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.1,"I-CLS":56.0,"I-QA":57.4,"I-RET":69.9,"I-VG":83.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"MMRet-MLLM (LLaVA-1.6)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":44.0,"I-CLS":47.2,"I-QA":18.4,"I-RET":56.5,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"Magiclens","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":27.8,"I-CLS":38.8,"I-QA":8.3,"I-RET":35.4,"I-VG":26.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/google-deepmind\/magiclens"}
-{"Models":"OpenCLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":47.2,"I-CLS":56.0,"I-QA":21.9,"I-RET":55.4,"I-VG":64.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2212.07143"}
-{"Models":"QQMM-embed","Model Size(B)":8.297,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.175,"I-CLS":70.07,"I-QA":69.52,"I-RET":71.175,"I-VG":87.075,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/QQ-MM\/QQMM-embed"}
-{"Models":"UniIR (BLIP_FF)","Model Size(B)":0.247,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":42.8,"I-CLS":42.1,"I-QA":15.0,"I-RET":60.1,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniIR (CLIP_SF)","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":44.7,"I-CLS":44.3,"I-QA":16.2,"I-RET":61.8,"I-VG":65.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniME(LLaVA-1.6-7B-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":66.6,"I-CLS":60.6,"I-QA":52.9,"I-RET":67.9,"I-VG":85.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-1.6-7B"}
-{"Models":"UniME(LLaVA-OneVision-7B-LoRA-Res336)","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.7,"I-CLS":66.8,"I-QA":66.6,"I-RET":70.5,"I-VG":90.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-OneVision-7B"}
-{"Models":"UniME(Phi-3.5-V-LoRA)","Model Size(B)":4.2,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.2,"I-CLS":54.8,"I-QA":55.9,"I-RET":64.5,"I-VG":81.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-Phi3.5-V-4.2B"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-HighRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":62.9,"I-CLS":61.2,"I-QA":49.9,"I-RET":67.4,"I-VG":86.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.0,"I-CLS":54.7,"I-QA":50.3,"I-RET":56.2,"I-VG":64.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (Phi-3.5-V-FT)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.9,"I-CLS":52.8,"I-QA":50.3,"I-RET":57.8,"I-VG":72.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Phi-3.5-V-LoRA)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":60.1,"I-CLS":54.8,"I-QA":54.9,"I-RET":62.3,"I-VG":79.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Qwen2-VL-2B-LoRA-HighRes)","Model Size(B)":2.21,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":59.3,"I-CLS":59.0,"I-QA":49.4,"I-RET":65.4,"I-VG":73.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-2B"}
-{"Models":"VLM2Vec (Qwen2-VL-7B-LoRA-HighRes)","Model Size(B)":8.29,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":65.8,"I-CLS":62.6,"I-QA":57.8,"I-RET":69.9,"I-VG":81.7,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-7B"}
-{"Models":"blip2-opt-2.7b","Model Size(B)":3.74,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":25.2,"I-CLS":27.0,"I-QA":4.2,"I-RET":33.9,"I-VG":47.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Salesforce\/blip2-opt-2.7b"}
-{"Models":"clip-vit-large-patch14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":37.8,"I-CLS":42.8,"I-QA":9.1,"I-RET":53.0,"I-VG":51.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"}
-{"Models":"e5-v","Model Size(B)":8.36,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":13.3,"I-CLS":21.8,"I-QA":4.9,"I-RET":11.5,"I-VG":19.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/royokong\/e5-v"}
-{"Models":"gme-Qwen2-VL-2B-Instruct","Model Size(B)":2.21,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":55.8,"I-CLS":56.9,"I-QA":41.2,"I-RET":67.8,"I-VG":53.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Alibaba-NLP\/gme-Qwen2-VL-2B-Instruct"}
-{"Models":"mmE5 (w\/ 560K synthetic data)","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":58.6,"I-CLS":60.6,"I-QA":55.7,"I-RET":54.7,"I-VG":72.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"mmE5-mllama-11b-instruct","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":69.8,"I-CLS":67.6,"I-QA":62.6,"I-RET":71.0,"I-VG":89.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"open_clip-ViT-L\/14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":39.7,"I-CLS":47.8,"I-QA":10.9,"I-RET":52.3,"I-VG":53.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/mlfoundations\/open_clip"}
-{"Models":"siglip-base-patch16-224","Model Size(B)":0.203,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":34.8,"I-CLS":40.3,"I-QA":8.4,"I-RET":31.6,"I-VG":59.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/google\/siglip-base-patch16-224"}

urls.csv ADDED
@@ -0,0 +1,24 @@
+Models,URL
+clip-vit-large-patch14,https://huggingface.co/openai/clip-vit-large-patch14
+blip2-opt-2.7b,https://huggingface.co/Salesforce/blip2-opt-2.7b
+siglip-base-patch16-224,https://huggingface.co/google/siglip-base-patch16-224
+open_clip-ViT-L/14,https://github.com/mlfoundations/open_clip
+e5-v,https://huggingface.co/royokong/e5-v
+Magiclens,https://github.com/google-deepmind/magiclens
+MMRet,https://huggingface.co/JUNJIE99/MMRet-large
+VLM2Vec-Phi-3.5-v,https://huggingface.co/TIGER-Lab/VLM2Vec-Full
+VLM2Vec,https://github.com/TIGER-AI-Lab/VLM2Vec
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-7B
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-2B
+UniIR,https://huggingface.co/TIGER-Lab/UniIR
+OpenCLIP-FT,https://doi.org/10.48550/arXiv.2212.07143
+CLIP-FT,https://doi.org/10.48550/arXiv.2103.00020
+mmE5,https://huggingface.co/intfloat/mmE5-mllama-11b-instruct
+gme-Qwen2-VL-2B-Instruct,https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct
+MM-Embed,https://huggingface.co/nvidia/MM-Embed
+LLaVE-7B,https://huggingface.co/zhibinlan/LLaVE-7B
+LLaVE-2B,https://huggingface.co/zhibinlan/LLaVE-2B
+LLaVE-0.5B,https://huggingface.co/zhibinlan/LLaVE-0.5B
+UniME(LLaVA-OneVision-7B-LoRA-Res336),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-OneVision-7B
+UniME(LLaVA-1.6-7B-LoRA-LowRes),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-1.6-7B
+UniME(Phi-3.5-V-LoRA),https://huggingface.co/DeepGlint-AI/UniME-Phi3.5-V-4.2B

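Taken together with the validation in the new add_new_eval function (see the utils.py diff below), a self-uploaded submission needs "Model" plus "Overall", "Model Size(B)" and the four task scores. A hedged example payload, with placeholder values mirroring the template in SUBMIT_INTRODUCTION:

```python
import json

# Placeholder submission matching the column names in results.csv above and the
# fields checked by add_new_eval in utils.py below; the numbers are dummies.
submission = {
    "Model": "<Model Name>",
    "URL": "<Model URL>",           # optional; used to hyperlink the model name
    "Model Size(B)": 1000,
    "Data Source": "Self-Reported",
    "Overall": 50.0,
    "Classification": 50.0,
    "VQA": 50.0,
    "Retrieval": 50.0,
    "Grounding": 50.0,
}

# add_new_eval parses the upload with json.loads and then indexes the result as
# a dict (upload_data["Model"]), so a single top-level object appears to be expected.
print(json.dumps(submission, indent=2))
```
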
utils.py CHANGED
@@ -10,39 +10,64 @@ from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

-BASE_COLS = ["Rank", "Models", "Model Size(B)", "Data Source"]
-TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
-TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]
-COLUMN_NAMES = BASE_COLS + TASKS_V1 + TASKS_V2
+TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]

-DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
-    ['number'] * len(TASKS_V1 + TASKS_V2)
+MODEL_INFO = [
+    "Rank", "Models", "Model Size(B)", "Data Source",
+    "Overall",
+    "Classification", "VQA", "Retrieval", "Grounding"
+]
+
+BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
+
+DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+SUBMISSION_NAME = "MMEB"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/TIGER-Lab/", SUBMISSION_NAME)
+FILE_NAME = "results.csv"
+CSV_DIR = "./results.csv"
+
+COLUMN_NAMES = MODEL_INFO

LEADERBOARD_INTRODUCTION = """
-# 📊 **MMEB LEADERBOARD (V1 & V2)**
+# MMEB Leaderboard

## Introduction
-We introduce a novel benchmark, **MMEB-V1 (Massive Multimodal Embedding Benchmark)**,
+We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
and evaluating embedding models across various combinations of text and image modalities.
All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
-or a combination of both. MMEB-V1 is divided into 20 in-distribution datasets, which can be used for
+or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
training, and 16 out-of-distribution datasets, reserved for evaluation.

-Building upon on **MMEB-V1**, **MMEB-V2** expands the evaluation scope to include five new tasks: four video-based tasks
-Video Retrieval, Moment Retrieval, Video Classification, and Video Question Answering — and one task focused on visual documents, Visual Document Retrieval.
-This comprehensive suite enables robust evaluation of multimodal embedding models across static, temporal, and structured visual data settings.
-
-| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
-| [**📖MMEB-V2/VLM2Vec-V2 Paper (TBA)**](https://arxiv.org/abs/2410.05160)
-| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
-| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
+The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160. \n
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
+Overview: https://tiger-ai-lab.github.io/VLM2Vec/. \n
"""

-TABLE_INTRODUCTION = """Models are ranked based on V1-Overall."""
+TABLE_INTRODUCTION = """"""

LEADERBOARD_INFO = """
## Dataset Summary
+MMEB is organized into four primary meta-task categories:
+- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
+consist of instructions and images, optionally accompanied by related text. Targets are class labels,
+and the number of class labels corresponds to the number of classes in the dataset. \n
+  - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
+  - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
+- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
+datasets. The query consists of an instruction, an image, and a piece of text as the question, while
+the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
+  - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
+  - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
+- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
+Both the query and target sides can involve a combination of text, images, and instructions. Similar
+to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
+  - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
+  - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
+- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
+  - IND: MSCOCO \n
+  - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -56,84 +81,112 @@ CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction

## ⚠ Please note that you need to submit the JSON file with the following format:
-
-### **TO SUBMIT V1 ONLY**
```json
[
    {
        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
+        <Optional>"URL": "<Model URL>",
+        "Model Size(B)": 1000,
+        "Data Source": Self-Reported,
+        "Overall": 50.0,
+        "Classification": 50.0,
+        "VQA": 50.0,
+        "Retrieval": 50.0,
+        "Grounding": 50.0
    },
]
```
-
-### **TO SUBMIT V2 ONLY**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V2-Overall": 50.0,
-        "V-CLS": 50.0,
-        "V-QA": 50.0,
-        "V-RET": 50.0,
-        "V-MRET": 50.0,
-        "VisDoc": 50.0
-    },
-]
-```
-You are also welcome to submit both versions by including all the fields above! :) \n
-You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for instructions about evaluating your model. \n
+You may refer to the Github page for instructions about evaluating your model.
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
Please send us an email at [email protected], attaching the JSON file. We will review your submission and update the leaderboard accordingly.
"""

+MODEL_URLS = pd.read_csv("urls.csv")
+MODEL_URLS = dict(zip(MODEL_URLS['Models'], MODEL_URLS['URL']))
+
def create_hyperlinked_names(df):
    def convert_url(url, model_name):
-        return f'<a href="{url}">{model_name}</a>' if url is not None else model_name
-
-    def add_link_to_model_name(row):
-        row['Models'] = convert_url(row['URL'], row['Models'])
-        return row
+        return f'<a href="{url}">{model_name}</a>'
+
+    def add_link_to_model_name(model_name):
+        if "VLM2Vec (Phi-3.5-V-" in model_name:
+            url = MODEL_URLS["VLM2Vec-Phi-3.5-v"]
+            return convert_url(url, model_name)
+        if "VLM2Vec (LLaVA-1.6-LoRA-" in model_name:
+            url = MODEL_URLS["VLM2Vec"]
+            return convert_url(url, model_name)
+        if "UniIR" in model_name:
+            url = MODEL_URLS["UniIR"]
+            return convert_url(url, model_name)
+        if "mmE5" in model_name:
+            url = MODEL_URLS["mmE5"]
+            return convert_url(url, model_name)
+        if "MMRet" in model_name:
+            url = MODEL_URLS["MMRet"]
+            return convert_url(url, model_name)
+        return convert_url(MODEL_URLS[model_name], model_name) if model_name in MODEL_URLS else model_name

    df = df.copy()
-    df = df.apply(add_link_to_model_name, axis=1)
+    df['Models'] = df['Models'].apply(add_link_to_model_name)
    return df

-# def fetch_data(file: str) -> pd.DataFrame:
-#     # fetch the leaderboard data from remote
-#     if file is None:
-#         raise ValueError("URL Not Provided")
-#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
-#     print(f"Fetching data from {url}")
-#     response = requests.get(url)
-#     if response.status_code != 200:
-#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
-#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
-
-def get_df(file="results.jsonl"):
-    df = pd.read_json(file, orient='records', lines=True)
+def get_df():
+    # fetch the leaderboard data
+    url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
+    response = requests.get(url, headers={"Authorization": f"Bearer {HF_TOKEN}"})
+    if response.status_code != 200:
+        import sys
+        sys.exit(f"Error: {response.status_code}")
+    df = pd.read_csv(io.StringIO(response.text))
+    df.to_csv(CSV_DIR, index=False)  # update local file
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
-    for task in TASKS_V1 + TASKS_V2:
-        if df[task].isnull().any():
-            df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
-    df = df.sort_values(by=['V1-Overall'], ascending=False)
+    df = df.sort_values(by=['Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df

+
+def add_new_eval(input_file):
+    if input_file is None:
+        return "Error! Empty file!"
+
+    # Load the input json file
+    upload_data = json.loads(input_file)
+    print("upload_data:\n", upload_data)
+    data_row = [f'{upload_data["Model"]}']
+    for col in ['Overall', 'Model Size(B)'] + TASKS:
+        if not col in upload_data.keys():
+            return f"Error! Missing {col} column!"
+        data_row += [upload_data[col]]
+    if 'URL' in upload_data.keys():
+        MODEL_URLS[upload_data['Model']] = upload_data['URL']
+    print("data_row:\n", data_row)
+    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
+                                 use_auth_token=HF_TOKEN, repo_type="space")
+    submission_repo.git_pull()
+
+    # Track submitted models
+    already_submitted = []
+    with open(CSV_DIR, mode='r') as file:
+        reader = csv.reader(file, delimiter=',')
+        for row in reader:
+            already_submitted.append(row[0])
+    # if not in the existing models list, add it to the csv file
+    if data_row[0] not in already_submitted:
+        with open(CSV_DIR, mode='a', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(data_row)
+
+        submission_repo.push_to_hub()
+        print('Submission Successful')
+    else:
+        print('The model already exists in the leaderboard!')
+
def refresh_data():
    df = get_df()
    return df[COLUMN_NAMES]

+
def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()

@@ -170,6 +223,7 @@ def process_model_size(size):
    except (ValueError, TypeError):
        return 'unknown'

+
def filter_columns_by_tasks(df, selected_tasks=None):
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
@@ -179,3 +233,7 @@ def filter_columns_by_tasks(df, selected_tasks=None):

    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
+
+def get_task_choices():
+    return TASKS
+
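add_new_eval handles parsing, validation and the push back to the Space repo, but the Gradio components that feed it are not part of the hunks shown above. A minimal sketch of how the submit tab could be wired to it; the component names, the gr.File(type="binary") choice and the layout are assumptions, not code from this PR:

```python
import gradio as gr

from utils import add_new_eval  # function added by this PR

# Hypothetical wiring for the "Submit here!" tab. add_new_eval calls json.loads
# on the uploaded content, so the file is passed through as raw bytes; it also
# needs HF_TOKEN in the environment to push the updated results.csv.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🚀 Submit here! "):
            upload = gr.File(label="Upload your results (JSON)", type="binary")
            submit = gr.Button("Submit")
            status = gr.Textbox(label="Submission status", interactive=False)
            # add_new_eval returns an error string when a required field is
            # missing; otherwise it appends the row to results.csv and pushes
            # it, skipping models that are already listed.
            submit.click(fn=add_new_eval, inputs=upload, outputs=status)

demo.launch()
```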