from transformers import AutoTokenizer, CLIPImageProcessor


class QWenVLProcessor:
    """Pairs a text tokenizer with a CLIP image processor behind a single,
    processor-style interface."""

    def __init__(self, tokenizer, image_processor):
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    @classmethod
    def from_pretrained(cls, model_id, **kwargs):
        # Text side: the checkpoint's own tokenizer. Vision side: the CLIP
        # ViT-L/14 image processor, which handles resizing and normalization.
        tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
        image_processor = CLIPImageProcessor.from_pretrained(
            "openai/clip-vit-large-patch14"
        )
        return cls(tokenizer=tokenizer, image_processor=image_processor)

    def __call__(self, text=None, images=None, return_tensors=None):
        # Process whichever modalities were supplied, then merge the two
        # outputs into one batch dict.
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
        else:
            image_inputs = {}
        if text is not None:
            text_inputs = self.tokenizer(text, return_tensors=return_tensors, padding=True)
        else:
            text_inputs = {}
        return {**text_inputs, **image_inputs}

    def batch_decode(self, *args, **kwargs):
        # Decoding token ids back to text is delegated to the tokenizer.
        return self.tokenizer.batch_decode(*args, **kwargs)
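

# A minimal usage sketch. The checkpoint id "Qwen/Qwen-VL" and the image path
# below are illustrative assumptions, not requirements of the class above;
# also note that a tokenizer without a pad token will reject padded batches
# until one is set.
if __name__ == "__main__":
    from PIL import Image

    processor = QWenVLProcessor.from_pretrained(
        "Qwen/Qwen-VL", trust_remote_code=True  # assumed checkpoint id
    )
    image = Image.open("example.jpg")  # assumed local image file
    inputs = processor(
        text=["Describe the image."], images=image, return_tensors="pt"
    )
    # `inputs` merges the tokenizer fields (input_ids, attention_mask) with
    # the image processor field (pixel_values) in a single dict, ready to be
    # passed to a model's forward/generate call.
    print({k: getattr(v, "shape", v) for k, v in inputs.items()})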