import os

import torch

from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors.image import load_image
from modelscope.utils.config import Config

from vlmo.utils.beit_utils import load_from_config
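
# Note: ``load_from_config`` comes from the ``vlmo`` package bundled with the
# model files; it is assumed to be importable (e.g. with the model directory on
# ``sys.path``) before this pipeline module is loaded.
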
@PIPELINES.register_module(
    "multi-modal-embeddings", module_name="multi-modal-embedding-pipeline"
)
class MyCustomPipeline(Pipeline):
"""Give simple introduction to this pipeline. |
|
|
|
Examples: |
|
|
|
>>> from modelscope.pipelines import pipeline |
|
>>> input = "Hello, ModelScope!" |
|
>>> my_pipeline = pipeline('my-task', 'my-model-id') |
|
>>> result = my_pipeline(input) |
|
|
|
""" |
|
|
|
    def __init__(self, model, preprocessor=None, **kwargs):
        """Create a custom pipeline for prediction from ``model`` and ``preprocessor``.

        Args:
            model: model id on the ModelScope hub, or a local model directory.
            preprocessor: optional preprocessor; unused here, since the tokenizer
                and image processor are built by ``load_from_config`` below.
        """
        super().__init__(model=model, auto_collate=False)
        self.model_dir = model
        self._device = "cuda" if torch.cuda.is_available() else "cpu"

        # Model hyper-parameters and asset paths for the bundled 1B
        # M2-Encoder checkpoint.
        model_config = {
            "loss_names": {"itc": 1},
            "beit_version": "large",
            "encoder_embed_dim": 1024,
            "out_embed_dim": 1024,
            "encoder_layers": 21,
            "beit3_vl_layers": 3,
            "visual_mask_size": 14,
            "tokenizer_type": "GLMChineseTokenizer",
            "tokenizer": os.path.join(self.model_dir, "vlmo/tokenizer"),
            "vocab_size": 115244,
            "whole_word_masking": False,
            "precision": 32,
            "test_only": True,
            "flash_attn": True,
            "model_path": os.path.join(self.model_dir, "m2_encoder_1B.ckpt"),
            "modelscope": {"model_id": "M2Cognition/M2_Encoder_Large"},
            "model_file": "m2_encoder_1B.ckpt",
        }
        model, processors = load_from_config(model_config)
        self.model = model
        self.model.to(self._device).eval()
        self._tokenizer, self._img_processor = processors

    def _sanitize_parameters(self, **pipeline_parameters):
        """Sanitize the keyword args of ``__call__``/``_process_single`` into
        preprocess params, forward params and postprocess params.

        This default implementation routes all pipeline parameters to
        ``forward``.

        Returns:
            preprocess_params (dict): empty
            forward_params (dict): ``pipeline_parameters``
            postprocess_params (dict): empty
        """
        return {}, pipeline_parameters, {}

    def _check_input(self, inputs):
        pass

    def _check_output(self, outputs):
        pass

    def forward(self, forward_params):
        """Run inference using ``self.model``; users can reimplement this method.

        If a comma-separated ``label_list`` is provided, performs zero-shot
        classification of the input image against the labels; otherwise returns
        text and/or image embeddings.
        """
        labels = forward_params.get("label_list", "")
        labels = labels.split(",")
        if len(labels) > 1 and labels[0] != "":
            # Zero-shot classification: encode the candidate labels ...
            txt_encoding = self._tokenizer(
                labels,
                padding="max_length",
                truncation=True,
                max_length=self.model.hparams.config["max_text_len"],
                return_special_tokens_mask=True,
            )
            txt_data = {
                "text_ids": torch.tensor(txt_encoding["input_ids"]).to(self._device),
                "text_masks": torch.tensor(txt_encoding["attention_mask"]).to(
                    self._device
                ),
                "text_labels": None,
            }
            txt_feats = self.model.infer_text(txt_data)["cls_vlffn_feats"]
            # ... encode the image ...
            image = forward_params["image"]
            image = load_image(image)
            img = self._img_processor(image).unsqueeze(0)
            img_data = {"image": [img.to(self._device)]}
            img_feats = self.model.infer_image(img_data)["cls_vlffn_feats"]
            # ... and pick the label with the highest image-text similarity.
            logits_per_image = self.model.logit_scale.exp() * img_feats @ txt_feats.t()
            probs = logits_per_image.softmax(dim=-1).detach().cpu()
            index = probs.max(dim=-1)[1][0].item()
            label = labels[index]
            return {"text": label, "scores": probs.numpy().tolist()[0]}
        else:
            # Embedding extraction: encode the text and/or image independently
            # and return the raw feature vectors.
            rets = {}
            if "text" in forward_params:
                text = forward_params.get("text")
                txt_encoding = self._tokenizer(
                    text,
                    padding="max_length",
                    truncation=True,
                    max_length=self.model.hparams.config["max_text_len"],
                    return_special_tokens_mask=True,
                )
                txt_data = {
                    "text_ids": torch.tensor(txt_encoding["input_ids"]).to(
                        self._device
                    ),
                    "text_masks": torch.tensor(txt_encoding["attention_mask"]).to(
                        self._device
                    ),
                    "text_labels": None,
                }
                txt_feats = self.model.infer_text(txt_data)["cls_vlffn_feats"]
                rets.update({"text_embedding": txt_feats.detach()})
            if "img" in forward_params:
                input_img = forward_params["img"]
                img = self._img_processor(input_img).unsqueeze(0)
                img_data = {"image": [img.to(self._device)]}
                img_feats = self.model.infer_image(img_data)["cls_vlffn_feats"]
                rets.update({"img_embedding": img_feats.detach()})
            return rets

    def preprocess(self, inputs):
        # Tokenization and image preprocessing happen inside ``forward``, so
        # inputs are passed through unchanged.
        return inputs

    def postprocess(self, inputs):
        """If the current pipeline supports model reuse, common postprocess
        code should be written here.

        Args:
            inputs: input data

        Returns:
            Dict of results: a dict containing the outputs of the model; each
            output should use the standard output name.
        """
        return inputs


""" |
|
# Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id |
|
usr_config_path = "/tmp/snapdown/" |
|
config = Config( |
|
{ |
|
"framework": "pytorch", |
|
"task": "multi-modal-embeddings", |
|
"model": {"type": "m2-encoder"}, |
|
"pipeline": {"type": "multi-modal-embedding-pipeline"}, |
|
"allow_remote": True, |
|
} |
|
) |
|
config.dump("/tmp/snapdown/" + "configuration.json") |
|
""" |
|
|
|
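# For reference, a sketch of the configuration.json the commented-out snippet
# above would produce (a direct JSON rendering of the Config dict):
#
# {
#     "framework": "pytorch",
#     "task": "multi-modal-embeddings",
#     "model": {"type": "m2-encoder"},
#     "pipeline": {"type": "multi-modal-embedding-pipeline"},
#     "allow_remote": true
# }
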
if __name__ == "__main__":
    from modelscope.pipelines import pipeline

    model = "M2Cognition/M2-Encoder"
    pipe = pipeline("multi-modal-embeddings", model=model)

    # Zero-shot classification: the labels are the Chinese names of
    # Squirtle, Bulbasaur, Charmander and Pikachu.
    inputs = {
        "image": "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg",
        "label_list": "杰尼龟,妙蛙种子,小火龙,皮卡丘",
    }
    demo = pipe(inputs)
    print("demo output", demo)

    # Text embeddings.
    inputs = {"text": ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]}
    output = pipe(inputs)
    print("text output", output)

    # Image embedding.
    input_img = load_image(
        "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
    )
    inputs = {"img": input_img}
    img_embedding = pipe(inputs)
    print("image output", img_embedding)
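
    # A minimal sketch (not part of the original demo): score the image against
    # the texts using the embeddings returned above. The output keys
    # "text_embedding" and "img_embedding" come from the embedding branch of
    # ``forward``; L2-normalizing before the dot product is an assumption here,
    # following standard CLIP-style retrieval.
    txt_emb = output["text_embedding"]
    img_emb = img_embedding["img_embedding"]
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    print("image-text cosine similarities", (img_emb @ txt_emb.t()).cpu().numpy())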