from transformers import AutoTokenizer, CLIPImageProcessor


class QWenVLProcessor:
    """Lightweight processor pairing a Qwen-VL tokenizer with a CLIP image processor."""

    def __init__(self, tokenizer, image_processor):
        self.tokenizer = tokenizer
        self.image_processor = image_processor

    @classmethod
    def from_pretrained(cls, model_id, **kwargs):
        # Load the model's tokenizer; extra kwargs (e.g. trust_remote_code) are passed through.
        tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
        # Reuse the CLIP ViT-L/14 image processor for image preprocessing.
        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
        return cls(tokenizer=tokenizer, image_processor=image_processor)

    def __call__(self, text=None, images=None, return_tensors=None):
        # Preprocess images (resize, normalize, optionally convert to tensors) if any were given.
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
        else:
            image_inputs = {}
        # Tokenize text (padded across the batch) if any was given.
        if text is not None:
            text_inputs = self.tokenizer(text, return_tensors=return_tensors, padding=True)
        else:
            text_inputs = {}
        # Merge both dicts: input_ids/attention_mask from the tokenizer, pixel_values from the image processor.
        return {**text_inputs, **image_inputs}

    def batch_decode(self, *args, **kwargs):
        # Delegate decoding of generated token ids back to the tokenizer.
        return self.tokenizer.batch_decode(*args, **kwargs)
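
# --- Usage sketch (not part of the original class). Assumptions: "Qwen/Qwen-VL" as the
# model id, a local image file "example.jpg", and trust_remote_code=True for the
# tokenizer's custom code; adjust these to your setup. ---
if __name__ == "__main__":
    from PIL import Image

    processor = QWenVLProcessor.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

    image = Image.open("example.jpg").convert("RGB")  # hypothetical local image
    inputs = processor(text="Describe this image.", images=image, return_tensors="pt")

    # inputs holds the tokenizer fields (input_ids, attention_mask, ...) plus the
    # image processor's pixel_values, ready to be passed to a model's forward call.
    print({k: v.shape for k, v in inputs.items()})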