---
license: apache-2.0
base_model:
- DeepGlint-AI/MLCD-Embodied-7B
---
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcocog)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcocog?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco-5)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-5?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco-3)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-3?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcocog-1)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcocog-1?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco-8)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-8?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco-4)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-4?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco-9)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco-9?p=multi-label-cluster-discrimination-for-visual)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/multi-label-cluster-discrimination-for-visual/referring-expression-segmentation-on-refcoco)](https://paperswithcode.com/sota/referring-expression-segmentation-on-refcoco?p=multi-label-cluster-discrimination-for-visual)
## RefCOCO Segmentation Evaluation
| Dataset | Split | MLCD-seg-7B | EVF-SAM | GLaMM | VisionLLM v2| LISA |
| :-- | :-: | :-: | :-: | :-: | :-: | :-: |
| RefCOCO | val | **83.6** | 82.4 | 79.5 | 79.2 | 74.9 |
| RefCOCO | testA | **85.3** | 84.2 | 83.2 | 82.3 | 79.1 |
| RefCOCO | testB | **81.5** | 80.2 | 76.9 | 77.0 | 72.3 |
| RefCOCO+ | val | **79.4** | 76.5 | 72.6 | 68.9 | 65.1 |
| RefCOCO+ | testA | **82.9** | 80.0 | 78.7 | 75.8 | 70.8 |
| RefCOCO+ | testB | **75.6** | 71.9 | 64.6 | 61.8 | 58.1 |
| RefCOCOg | val | **79.7** | 78.2 | 74.2 | 73.3 | 67.9 |
| RefCOCOg | test | **80.5** | 78.3 | 74.9 | 74.8 | 70.6 |
## Evaluation
If you just want to try the model on a single image, refer to the sample below:
```python
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

model_path = "DeepGlint-AI/MLCD-Seg"  # or use your local path
mlcd_seg = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True
).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Assuming you have an image named test.jpg
seg_img = Image.open("test.jpg").convert('RGB')
seg_prompt = "Could you provide a segmentation mask for the right giraffe in this image?"
pred_mask = mlcd_seg.seg(seg_img, seg_prompt, tokenizer, force_seg=False)
```
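To sanity-check the result, here is a minimal sketch for saving the predicted mask to disk. It assumes `pred_mask` comes back as a float tensor of shape `[H, W]` (possibly with a leading batch dimension), matching how the video example further down treats it; adjust if the actual return type differs:
```python
import numpy as np
from PIL import Image

# Binarize at 0.5 (same threshold as the video example) and save as a PNG.
mask = pred_mask.squeeze().float().cpu().numpy() > 0.5
Image.fromarray((mask * 255).astype(np.uint8)).save("test_mask.png")
```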
If you want to evaluate on a benchmark dataset (e.g. RefCOCO), force mask generation with `force_seg=True`:
```python
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

model_path = "DeepGlint-AI/MLCD-Seg"  # or use your local path
mlcd_seg = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True
).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Assuming you have an image named test.jpg
seg_img = Image.open("test.jpg").convert('RGB')
seg_prompt = "Could you provide a segmentation mask for the right giraffe in this image?"
pred_mask = mlcd_seg.seg(seg_img, seg_prompt, tokenizer, force_seg=True)
```
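For reference, referring-segmentation benchmarks such as RefCOCO are usually scored with cumulative IoU (cIoU), where intersections and unions are summed over the whole split before dividing. A minimal sketch of the metric, assuming you have collected binary prediction and ground-truth masks per sample (the mask lists are placeholders, not part of this repo):
```python
import torch

def cumulative_iou(pred_masks, gt_masks):
    """cIoU: sum of intersections over sum of unions across the split."""
    inter = union = 0
    for pred, gt in zip(pred_masks, gt_masks):
        pred, gt = pred.bool(), gt.bool()
        inter += (pred & gt).sum().item()
        union += (pred | gt).sum().item()
    return inter / union if union else 0.0
```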
If you want to run the model on a video, refer to the sample below:
```python
import os
import subprocess

import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoModel, AutoTokenizer

# video path
video_path = "updownfunk.mp4"
input_dir = "frames"
output_dir = "mask_frames"
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Assumes you have ffmpeg installed: mp4 -> jpgs
cmd = [
    "ffmpeg",
    "-i", video_path,
    "-vf", "fps=30",  # extract at 30 FPS
    "-qscale:v", "1",
    os.path.join(input_dir, "frame_%04d.jpg")
]
subprocess.run(cmd, check=True)

# model path
model_path = "DeepGlint-AI/MLCD-Seg"  # or use your local path
mlcd_seg = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True
).cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# read extracted frames in order
image_files = sorted([f for f in os.listdir(input_dir) if f.endswith(('.jpg', '.png', '.jpeg'))])
for idx, filename in enumerate(image_files, start=1):
    src_path = os.path.join(input_dir, filename)
    seg_img = Image.open(src_path).convert('RGB')
    seg_prompt = "This <video> depicts a group of people dancing.\nCould you provide a segmentation mask for the man in pink suit?"
    pred_mask = mlcd_seg.predict_forward(seg_img, seg_prompt, tokenizer, force_seg=True)

    # Mask visualization: green overlay on the masked area, dimmed background
    pred_mask = pred_mask.squeeze(0).cpu()
    pred_mask = (pred_mask > 0.5).float()
    img_tensor = transforms.ToTensor()(seg_img)
    alpha = 0.2  # 20% overlay opacity
    green_mask = torch.tensor([0.0, 1.0, 0.0]).view(3, 1, 1).to(img_tensor.device)  # green mask
    black_bg = torch.zeros_like(img_tensor)  # black background
    masked_area = green_mask * alpha + img_tensor * (1 - alpha)
    background = black_bg * alpha + img_tensor * (1 - alpha)
    combined = torch.where(pred_mask.unsqueeze(0).bool(), masked_area, background)
    combined = combined.cpu()  # [3, H, W], CPU

    # Save masked frame
    new_name = f"{idx:04d}{os.path.splitext(filename)[1]}"
    dst_path = os.path.join(output_dir, new_name)
    transforms.ToPILImage()(combined.clamp(0, 1)).save(dst_path)

# jpgs -> mp4
cmd = [
    "ffmpeg",
    "-y",
    "-framerate", "30",  # must match the extraction FPS above
    "-i", os.path.join(output_dir, "%04d.jpg"),
    "-c:v", "libx264",
    "-crf", "23",
    "-pix_fmt", "yuv420p",
    "updownfunk_mask.mp4"  # output video
]
subprocess.run(cmd, check=True)
```
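One caveat on the sample above: 30 FPS is hard-coded in both ffmpeg commands, so a clip recorded at another frame rate will play back at the wrong speed. A small sketch that queries the true frame rate with ffprobe (standard ffprobe options), so you can substitute it in both commands:
```python
import subprocess

def video_fps(path: str) -> float:
    """Read the average frame rate of the first video stream via ffprobe."""
    out = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", "v:0",
         "-show_entries", "stream=avg_frame_rate",
         "-of", "default=noprint_wrappers=1:nokey=1", path],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    num, den = out.split("/")  # ffprobe reports a fraction such as "30000/1001"
    return float(num) / float(den)

fps = video_fps("updownfunk.mp4")  # pass this to both ffmpeg commands instead of 30
```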
## Example
<img src="https://github.com/user-attachments/assets/85c023a1-3e0c-4ea5-a764-1eb9ee0fbddf" alt="output" width="1024"/>
<img src="https://github.com/user-attachments/assets/5b767327-bd0a-4185-8f7e-b1ab0aa260c9" alt="output" width="1024"/>
<video width="80%" controls>
<source src="https://github.com/user-attachments/assets/380dee0d-47c4-4e01-8ff0-e69e62cccd7c">
</video>
## Citations
```bibtex
@misc{mlcdseg_wukun,
  author = {Wu, Kun and Xie, Yin and Zhou, Xinyu and An, Xiang and Deng, Jiankang and Jie, Yu},
title = {MLCD-Seg},
year = {2025},
url = {https://github.com/deepglint/unicom/tree/main/downstream},
}
```