add self upload feature #36
opened by MINGYISU
- .gitattributes +0 -1
- .gitignore +0 -2
- app.py +4 -4
- overview.png +0 -3
- results.csv +30 -0
- results.jsonl +0 -30
- urls.csv +24 -0
- utils.py +129 -71
.gitattributes
CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-overview.png filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -11,5 +11,3 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/
-.gitignore
-.gradio
app.py
CHANGED
@@ -52,10 +52,11 @@ with gr.Blocks() as block:
                 label="Maximum number of parameters (B)",
             )
 
+            task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
             with gr.Row():
                 tasks_select = gr.CheckboxGroup(
-                    choices=
-                    value=
+                    choices=task_choices,
+                    value=task_choices,
                     label="Select tasks to Display",
                     elem_id="tasks-select"
                 )
@@ -100,11 +101,10 @@ with gr.Blocks() as block:
         # table 2
         with gr.TabItem("📝 About", elem_id="qa-tab-table2", id=2):
             gr.Markdown(LEADERBOARD_INFO, elem_classes="markdown-text")
-            gr.Image("overview.png", width=900, label="Dataset Overview")
 
         # table 3
         with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
-block.launch()
+block.launch(share=True)
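Note: the app.py hunks shown here only touch the task selector and the `launch()` call; the UI wiring for the new self-upload path is not visible in this excerpt. Below is a minimal, hypothetical sketch of how a file upload could be connected to `utils.add_new_eval` — the component names (`upload_box`, `submit_button`, `status`) and the `type="binary"` choice are assumptions for illustration, not part of this diff.

```python
# Hypothetical wiring (not shown in this diff): pass an uploaded JSON file
# to utils.add_new_eval and surface its return value in the UI.
import gradio as gr
from utils import add_new_eval

with gr.Blocks() as block:
    upload_box = gr.File(label="Upload result JSON", type="binary")  # bytes work with json.loads
    submit_button = gr.Button("Submit")
    status = gr.Textbox(label="Submission status")
    # add_new_eval(input_file) parses the JSON, validates the required columns,
    # appends a row to results.csv, and pushes the Space repository.
    submit_button.click(fn=add_new_eval, inputs=upload_box, outputs=status)

block.launch()
```

Since `add_new_eval` in this PR returns a message only on error and prints on success, the status box would stay blank for successful submissions unless a return value is added.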
overview.png
DELETED
(binary image file, previously tracked with Git LFS)
results.csv
ADDED
@@ -0,0 +1,30 @@
+Models,Model Size(B),Data Source,Overall,Classification,VQA,Retrieval,Grounding
+clip-vit-large-patch14,0.428,TIGER-Lab,37.8,42.8,9.1,53.0,51.8
+blip2-opt-2.7b,3.74,TIGER-Lab,25.2,27.0,4.2,33.9,47.0
+siglip-base-patch16-224,0.203,TIGER-Lab,34.8,40.3,8.4,31.6,59.5
+open_clip-ViT-L/14,0.428,TIGER-Lab,39.7,47.8,10.9,52.3,53.3
+UniIR (BLIP_FF),0.247,TIGER-Lab,42.8,42.1,15.0,60.1,62.2
+UniIR (CLIP_SF),0.428,TIGER-Lab,44.7,44.3,16.2,61.8,65.3
+e5-v,8.36,TIGER-Lab,13.3,21.8,4.9,11.5,19.0
+Magiclens,0.428,TIGER-Lab,27.8,38.8,8.3,35.4,26.0
+CLIP-FT,0.428,TIGER-Lab,45.4,55.2,19.7,53.2,62.2
+OpenCLIP-FT,0.428,TIGER-Lab,47.2,56.0,21.9,55.4,64.1
+VLM2Vec (Phi-3.5-V-FT),4.15,TIGER-Lab,55.9,52.8,50.3,57.8,72.3
+VLM2Vec (Phi-3.5-V-LoRA),4.15,TIGER-Lab,60.1,54.8,54.9,62.3,79.5
+VLM2Vec (LLaVA-1.6-LoRA-LowRes),7.57,TIGER-Lab,55.0,54.7,50.3,56.2,64.0
+VLM2Vec (LLaVA-1.6-LoRA-HighRes),7.57,TIGER-Lab,62.9,61.2,49.9,67.4,86.1
+MMRet-MLLM (LLaVA-1.6),7.57,Self-Reported,44.0,47.2,18.4,56.5,62.2
+MMRet-MLLM (FT),7.57,Self-Reported,64.1,56.0,57.4,69.9,83.6
+mmE5-mllama-11b-instruct,10.6,Self-Reported,69.8,67.6,62.6,71.0,89.6
+mmE5 (w/ 560K synthetic data),10.6,Self-Reported,58.6,60.6,55.7,54.7,72.4
+MM-Embed,8.18,Self-Reported,50.0,48.1,32.3,63.8,57.8
+gme-Qwen2-VL-2B-Instruct,2.21,Self-Reported,55.8,56.9,41.2,67.8,53.4
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),8.29,TIGER-Lab,65.8,62.6,57.8,69.9,81.7
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),2.21,TIGER-Lab,59.3,59.0,49.4,65.4,73.4
+LLaVE-7B,8.03,Self-Reported,70.3,65.7,65.4,70.9,91.9
+LLaVE-2B,1.95,Self-Reported,65.2,62.1,60.2,65.2,84.9
+LLaVE-0.5B,0.894,Self-Reported,59.1,57.4,50.3,59.8,82.9
+UniME(LLaVA-OneVision-7B-LoRA-Res336),8.03,Self-Reported,70.7,66.8,66.6,70.5,90.9
+UniME(LLaVA-1.6-7B-LoRA-LowRes),7.57,Self-Reported,66.6,60.6,52.9,67.9,85.1
+UniME(Phi-3.5-V-LoRA),4.2,Self-Reported,64.2,54.8,55.9,64.5,81.8
+QQMM-embed,8.297,Self-Reported,72.175,70.07,69.52,71.175,87.075
results.jsonl
DELETED
@@ -1,30 +0,0 @@
-{"Models":"B3","Model Size(B)":8.29,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.0,"I-CLS":70.0,"I-QA":66.5,"I-RET":74.1,"I-VG":84.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/raghavlite\/B3_Qwen2_7B"}
-{"Models":"CLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":45.4,"I-CLS":55.2,"I-QA":19.7,"I-RET":53.2,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2103.00020"}
-{"Models":"LLaVE-0.5B","Model Size(B)":0.894,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":59.1,"I-CLS":57.4,"I-QA":50.3,"I-RET":59.8,"I-VG":82.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-0.5B"}
-{"Models":"LLaVE-2B","Model Size(B)":1.95,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":65.2,"I-CLS":62.1,"I-QA":60.2,"I-RET":65.2,"I-VG":84.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-2B"}
-{"Models":"LLaVE-7B","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.3,"I-CLS":65.7,"I-QA":65.4,"I-RET":70.9,"I-VG":91.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/zhibinlan\/LLaVE-7B"}
-{"Models":"MM-Embed","Model Size(B)":8.18,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":50.0,"I-CLS":48.1,"I-QA":32.3,"I-RET":63.8,"I-VG":57.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/nvidia\/MM-Embed"}
-{"Models":"MMRet-MLLM (FT)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.1,"I-CLS":56.0,"I-QA":57.4,"I-RET":69.9,"I-VG":83.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"MMRet-MLLM (LLaVA-1.6)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":44.0,"I-CLS":47.2,"I-QA":18.4,"I-RET":56.5,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/JUNJIE99\/MMRet-large"}
-{"Models":"Magiclens","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":27.8,"I-CLS":38.8,"I-QA":8.3,"I-RET":35.4,"I-VG":26.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/google-deepmind\/magiclens"}
-{"Models":"OpenCLIP-FT","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":47.2,"I-CLS":56.0,"I-QA":21.9,"I-RET":55.4,"I-VG":64.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/doi.org\/10.48550\/arXiv.2212.07143"}
-{"Models":"QQMM-embed","Model Size(B)":8.297,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":72.175,"I-CLS":70.07,"I-QA":69.52,"I-RET":71.175,"I-VG":87.075,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/QQ-MM\/QQMM-embed"}
-{"Models":"UniIR (BLIP_FF)","Model Size(B)":0.247,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":42.8,"I-CLS":42.1,"I-QA":15.0,"I-RET":60.1,"I-VG":62.2,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniIR (CLIP_SF)","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":44.7,"I-CLS":44.3,"I-QA":16.2,"I-RET":61.8,"I-VG":65.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/UniIR"}
-{"Models":"UniME(LLaVA-1.6-7B-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":66.6,"I-CLS":60.6,"I-QA":52.9,"I-RET":67.9,"I-VG":85.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-1.6-7B"}
-{"Models":"UniME(LLaVA-OneVision-7B-LoRA-Res336)","Model Size(B)":8.03,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":70.7,"I-CLS":66.8,"I-QA":66.6,"I-RET":70.5,"I-VG":90.9,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-LLaVA-OneVision-7B"}
-{"Models":"UniME(Phi-3.5-V-LoRA)","Model Size(B)":4.2,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":64.2,"I-CLS":54.8,"I-QA":55.9,"I-RET":64.5,"I-VG":81.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/DeepGlint-AI\/UniME-Phi3.5-V-4.2B"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-HighRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":62.9,"I-CLS":61.2,"I-QA":49.9,"I-RET":67.4,"I-VG":86.1,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (LLaVA-1.6-LoRA-LowRes)","Model Size(B)":7.57,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.0,"I-CLS":54.7,"I-QA":50.3,"I-RET":56.2,"I-VG":64.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-LLaVa-Next"}
-{"Models":"VLM2Vec (Phi-3.5-V-FT)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":55.9,"I-CLS":52.8,"I-QA":50.3,"I-RET":57.8,"I-VG":72.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Phi-3.5-V-LoRA)","Model Size(B)":4.15,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":60.1,"I-CLS":54.8,"I-QA":54.9,"I-RET":62.3,"I-VG":79.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Full"}
-{"Models":"VLM2Vec (Qwen2-VL-2B-LoRA-HighRes)","Model Size(B)":2.21,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":59.3,"I-CLS":59.0,"I-QA":49.4,"I-RET":65.4,"I-VG":73.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-2B"}
-{"Models":"VLM2Vec (Qwen2-VL-7B-LoRA-HighRes)","Model Size(B)":8.29,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":65.8,"I-CLS":62.6,"I-QA":57.8,"I-RET":69.9,"I-VG":81.7,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/TIGER-Lab\/VLM2Vec-Qwen2VL-7B"}
-{"Models":"blip2-opt-2.7b","Model Size(B)":3.74,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":25.2,"I-CLS":27.0,"I-QA":4.2,"I-RET":33.9,"I-VG":47.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Salesforce\/blip2-opt-2.7b"}
-{"Models":"clip-vit-large-patch14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":37.8,"I-CLS":42.8,"I-QA":9.1,"I-RET":53.0,"I-VG":51.8,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"}
-{"Models":"e5-v","Model Size(B)":8.36,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":13.3,"I-CLS":21.8,"I-QA":4.9,"I-RET":11.5,"I-VG":19.0,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/royokong\/e5-v"}
-{"Models":"gme-Qwen2-VL-2B-Instruct","Model Size(B)":2.21,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":55.8,"I-CLS":56.9,"I-QA":41.2,"I-RET":67.8,"I-VG":53.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/Alibaba-NLP\/gme-Qwen2-VL-2B-Instruct"}
-{"Models":"mmE5 (w\/ 560K synthetic data)","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":58.6,"I-CLS":60.6,"I-QA":55.7,"I-RET":54.7,"I-VG":72.4,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"mmE5-mllama-11b-instruct","Model Size(B)":10.6,"Data Source":"Self-Reported","V2-Overall":null,"V1-Overall":69.8,"I-CLS":67.6,"I-QA":62.6,"I-RET":71.0,"I-VG":89.6,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/intfloat\/mmE5-mllama-11b-instruct"}
-{"Models":"open_clip-ViT-L\/14","Model Size(B)":0.428,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":39.7,"I-CLS":47.8,"I-QA":10.9,"I-RET":52.3,"I-VG":53.3,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/github.com\/mlfoundations\/open_clip"}
-{"Models":"siglip-base-patch16-224","Model Size(B)":0.203,"Data Source":"TIGER-Lab","V2-Overall":null,"V1-Overall":34.8,"I-CLS":40.3,"I-QA":8.4,"I-RET":31.6,"I-VG":59.5,"V-CLS":null,"V-QA":null,"V-RET":null,"V-MRET":null,"VisDoc":null,"URL":"https:\/\/huggingface.co\/google\/siglip-base-patch16-224"}
urls.csv
ADDED
@@ -0,0 +1,24 @@
+Models,URL
+clip-vit-large-patch14,https://huggingface.co/openai/clip-vit-large-patch14
+blip2-opt-2.7b,https://huggingface.co/Salesforce/blip2-opt-2.7b
+siglip-base-patch16-224,https://huggingface.co/google/siglip-base-patch16-224
+open_clip-ViT-L/14,https://github.com/mlfoundations/open_clip
+e5-v,https://huggingface.co/royokong/e5-v
+Magiclens,https://github.com/google-deepmind/magiclens
+MMRet,https://huggingface.co/JUNJIE99/MMRet-large
+VLM2Vec-Phi-3.5-v,https://huggingface.co/TIGER-Lab/VLM2Vec-Full
+VLM2Vec,https://github.com/TIGER-AI-Lab/VLM2Vec
+VLM2Vec (Qwen2-VL-7B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-7B
+VLM2Vec (Qwen2-VL-2B-LoRA-HighRes),https://huggingface.co/TIGER-Lab/VLM2Vec-Qwen2VL-2B
+UniIR,https://huggingface.co/TIGER-Lab/UniIR
+OpenCLIP-FT,https://doi.org/10.48550/arXiv.2212.07143
+CLIP-FT,https://doi.org/10.48550/arXiv.2103.00020
+mmE5,https://huggingface.co/intfloat/mmE5-mllama-11b-instruct
+gme-Qwen2-VL-2B-Instruct,https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct
+MM-Embed,https://huggingface.co/nvidia/MM-Embed
+LLaVE-7B,https://huggingface.co/zhibinlan/LLaVE-7B
+LLaVE-2B,https://huggingface.co/zhibinlan/LLaVE-2B
+LLaVE-0.5B,https://huggingface.co/zhibinlan/LLaVE-0.5B
+UniME(LLaVA-OneVision-7B-LoRA-Res336),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-OneVision-7B
+UniME(LLaVA-1.6-7B-LoRA-LowRes),https://huggingface.co/DeepGlint-AI/UniME-LLaVA-1.6-7B
+UniME(Phi-3.5-V-LoRA),https://huggingface.co/DeepGlint-AI/UniME-Phi3.5-V-4.2B
utils.py
CHANGED
@@ -10,39 +10,64 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-
-TASKS_V1 = ["V1-Overall", "I-CLS", "I-QA", "I-RET", "I-VG"]
-TASKS_V2 = ["V2-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET", "VisDoc"]
-COLUMN_NAMES = BASE_COLS + TASKS_V1 + TASKS_V2
-
-
+TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
+
+MODEL_INFO = [
+    "Rank", "Models", "Model Size(B)", "Data Source",
+    "Overall",
+    "Classification", "VQA", "Retrieval", "Grounding"
+]
+
+BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
+
+DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+
+SUBMISSION_NAME = "MMEB"
+SUBMISSION_URL = os.path.join("https://huggingface.co/spaces/TIGER-Lab/", SUBMISSION_NAME)
+FILE_NAME = "results.csv"
+CSV_DIR = "./results.csv"
+
+COLUMN_NAMES = MODEL_INFO
 
 LEADERBOARD_INTRODUCTION = """
-#
+# MMEB Leaderboard
 
 ## Introduction
-We introduce a novel benchmark,
+We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
 which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
 and evaluating embedding models across various combinations of text and image modalities.
 All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
-or a combination of both. MMEB
+or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
 training, and 16 out-of-distribution datasets, reserved for evaluation.
 
-
-
-
-
-| [**📈Overview**](https://tiger-ai-lab.github.io/VLM2Vec/) | [**Github**](https://github.com/TIGER-AI-Lab/VLM2Vec)
-| [**📖MMEB-V2/VLM2Vec-V2 Paper (TBA)**](https://arxiv.org/abs/2410.05160)
-| [**📖MMEB-V1/VLM2Vec-V1 Paper**](https://arxiv.org/abs/2410.05160)
-| [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
+The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160. \n
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
+Overview: https://tiger-ai-lab.github.io/VLM2Vec/. \n
 """
 
-TABLE_INTRODUCTION = """
+TABLE_INTRODUCTION = """"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
+MMEB is organized into four primary meta-task categories:
+- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
+consist of instructions and images, optionally accompanied by related text. Targets are class labels,
+and the number of class labels corresponds to the number of classes in the dataset. \n
+  - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
+  - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
+- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
+datasets. The query consists of an instruction, an image, and a piece of text as the question, while
+the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
+  - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
+  - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
+- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
+Both the query and target sides can involve a combination of text, images, and instructions. Similar
+to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
+  - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
+  - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
+- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
+  - IND: MSCOCO \n
+  - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -56,84 +81,112 @@ CITATION_BUTTON_TEXT = r"""@article{jiang2024vlm2vec,
 SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit the JSON file with the following format:
-
-### **TO SUBMIT V1 ONLY**
 ```json
 [
     {
         "Model": "<Model Name>",
-        "URL": "<Model URL>"
-        "Model Size(B)": 1000
-        "Data Source":
-        "
-        "
-        "
-        "
-        "
+        <Optional>"URL": "<Model URL>",
+        "Model Size(B)": 1000,
+        "Data Source": Self-Reported,
+        "Overall": 50.0,
+        "Classification": 50.0,
+        "VQA": 50.0,
+        "Retrieval": 50.0,
+        "Grounding": 50.0
     },
 ]
 ```
-
-
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V2-Overall": 50.0,
-        "V-CLS": 50.0,
-        "V-QA": 50.0,
-        "V-RET": 50.0,
-        "V-MRET": 50.0,
-        "VisDoc": 50.0
-    },
-]
-```
-You are also welcome to submit both versions by including all the fields above! :) \n
-You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for instructions about evaluating your model. \n
+You may refer to the Github page for instructions about evaluating your model.
+Github link: https://github.com/TIGER-AI-Lab/VLM2Vec. \n
 Please send us an email at [email protected], attaching the JSON file. We will review your submission and update the leaderboard accordingly.
 """
 
+MODEL_URLS = pd.read_csv("urls.csv")
+MODEL_URLS = dict(zip(MODEL_URLS['Models'], MODEL_URLS['URL']))
+
 def create_hyperlinked_names(df):
     def convert_url(url, model_name):
-        return f'<a href="{url}">{model_name}</a>'
-
-    def add_link_to_model_name(
-
+        return f'<a href="{url}">{model_name}</a>'
+
+    def add_link_to_model_name(model_name):
+        if "VLM2Vec (Phi-3.5-V-" in model_name:
+            url = MODEL_URLS["VLM2Vec-Phi-3.5-v"]
+            return convert_url(url, model_name)
+        if "VLM2Vec (LLaVA-1.6-LoRA-" in model_name:
+            url = MODEL_URLS["VLM2Vec"]
+            return convert_url(url, model_name)
+        if "UniIR" in model_name:
+            url = MODEL_URLS["UniIR"]
+            return convert_url(url, model_name)
+        if "mmE5" in model_name:
+            url = MODEL_URLS["mmE5"]
+            return convert_url(url, model_name)
+        if "MMRet" in model_name:
+            url = MODEL_URLS["MMRet"]
+            return convert_url(url, model_name)
+        return convert_url(MODEL_URLS[model_name], model_name) if model_name in MODEL_URLS else model_name
 
     df = df.copy()
-    df = df.apply(add_link_to_model_name
+    df['Models'] = df['Models'].apply(add_link_to_model_name)
     return df
 
-
-#
-# return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
-
-def get_df(file="results.jsonl"):
-    df = pd.read_json(file, orient='records', lines=True)
+def get_df():
+    # fetch the leaderboard data
+    url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
+    response = requests.get(url, headers={"Authorization": f"Bearer {HF_TOKEN}"})
+    if response.status_code != 200:
+        import sys
+        sys.exit(f"Error: {response.status_code}")
+    df = pd.read_csv(io.StringIO(response.text))
+    df.to_csv(CSV_DIR, index=False)  # update local file
     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
-
-    if df[task].isnull().any():
-        df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
-    df = df.sort_values(by=['V1-Overall'], ascending=False)
+    df = df.sort_values(by=['Overall'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df
 
+
+def add_new_eval(input_file):
+    if input_file is None:
+        return "Error! Empty file!"
+
+    # Load the input json file
+    upload_data = json.loads(input_file)
+    print("upload_data:\n", upload_data)
+    data_row = [f'{upload_data["Model"]}']
+    for col in ['Overall', 'Model Size(B)'] + TASKS:
+        if not col in upload_data.keys():
+            return f"Error! Missing {col} column!"
+        data_row += [upload_data[col]]
+    if 'URL' in upload_data.keys():
+        MODEL_URLS[upload_data['Model']] = upload_data['URL']
+    print("data_row:\n", data_row)
+    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
+                                 use_auth_token=HF_TOKEN, repo_type="space")
+    submission_repo.git_pull()
+
+    # Track submitted models
+    already_submitted = []
+    with open(CSV_DIR, mode='r') as file:
+        reader = csv.reader(file, delimiter=',')
+        for row in reader:
+            already_submitted.append(row[0])
+    # if not in the existing models list, add it to the csv file
+    if data_row[0] not in already_submitted:
+        with open(CSV_DIR, mode='a', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(data_row)
+
+        submission_repo.push_to_hub()
+        print('Submission Successful')
+    else:
+        print('The model already exists in the leaderboard!')
+
 def refresh_data():
     df = get_df()
     return df[COLUMN_NAMES]
 
+
 def search_and_filter_models(df, query, min_size, max_size):
     filtered_df = df.copy()
 
@@ -170,6 +223,7 @@ def process_model_size(size):
     except (ValueError, TypeError):
         return 'unknown'
 
+
 def filter_columns_by_tasks(df, selected_tasks=None):
     if selected_tasks is None or len(selected_tasks) == 0:
         return df[COLUMN_NAMES]
@@ -179,3 +233,7 @@ def filter_columns_by_tasks(df, selected_tasks=None):
 
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
+
+
+def get_task_choices():
+    return TASKS
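For reference, here is a small self-contained sketch of the payload `add_new_eval` expects, with the same column checks it performs before appending a row to `results.csv`. The values are purely illustrative; the real function additionally clones the Space with `Repository` and pushes the updated `results.csv`, which needs a write-enabled `HF_TOKEN`. Note that the code indexes the parsed object directly, so a single JSON object parses cleanly here, whereas the `SUBMIT_INTRODUCTION` example wraps it in a list.

```python
# Illustrative submission payload plus the same column validation that the
# new add_new_eval performs before writing a row to results.csv.
import json

TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]

submission = {
    "Model": "my-embedding-model",       # made-up name
    "URL": "https://example.com/model",  # optional field
    "Model Size(B)": 7.0,
    "Data Source": "Self-Reported",
    "Overall": 50.0,
    "Classification": 50.0,
    "VQA": 50.0,
    "Retrieval": 50.0,
    "Grounding": 50.0,
}

upload_data = json.loads(json.dumps(submission))  # mirrors json.loads(input_file)
data_row = [upload_data["Model"]]
for col in ["Overall", "Model Size(B)"] + TASKS:
    if col not in upload_data:
        raise ValueError(f"Missing {col} column!")
    data_row.append(upload_data[col])
print(data_row)  # the row that would be appended to results.csv
```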