Hyggge committed on
Commit 50e960f · 1 parent: 1bcb6c4

feat: update processor

Files changed (1): processing_valley.py (+18 -24)
processing_valley.py CHANGED
@@ -90,17 +90,11 @@ class ValleyProcessor(ProcessorMixin):
             qwen2vl_processor_config,
         )
 
-        max_pixels = kwargs.get("max_pixels", None)
-        min_pixels = kwargs.get("min_pixels", None)
-        if max_pixels:
-            self.qwen2vl_image_processor.max_pixels = max_pixels
-        if min_pixels:
-            self.qwen2vl_image_processor.min_pixels = min_pixels
-
         self.anyres = kwargs.get("anyres", True)
         self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
         self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
         self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
+        self.only_navit = kwargs.get("only_navit", False)
 
     def preprocess_images_siglip(self, images) -> torch.FloatTensor:
         if isinstance(images[0], str):
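
Note: this hunk moves the max_pixels/min_pixels clamping out of __init__ (it reappears as a per-call override in __call__, in the third hunk below) and adds an only_navit flag defaulting to False. A minimal loading sketch, assuming the usual trust_remote_code flow and that ProcessorMixin forwards extra kwargs to the constructor; the repo id is a placeholder, not from this commit:

    from transformers import AutoProcessor

    # only_navit=True makes __call__ skip the SigLIP branch (see the last hunk).
    # Whether the kwarg reaches __init__ this way depends on ProcessorMixin's
    # kwarg plumbing, so treat this as an assumption.
    processor = AutoProcessor.from_pretrained(
        "path/to/valley-model",  # placeholder repo id
        trust_remote_code=True,
        only_navit=False,
    )
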
@@ -150,25 +144,15 @@ class ValleyProcessor(ProcessorMixin):
 
         return data_dict_qwen2vl
 
-    def preprocess_multimodal(self, conversations, img_num):
+    def preprocess_multimodal(self, conversations):
         for sentence in conversations:
             if sentence["role"] == "system":
                 continue
-            if DEFAULT_VIDEO_TOKEN in sentence["content"]:
-                if self.use_special_start_end_token:
-                    video_replace_token = (DEFAULT_VI_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_VI_END_TOKEN) * img_num
-                else:
-                    video_replace_token = DEFAULT_IMAGE_TOKEN * img_num
-                sentence["content"] = sentence["content"].replace(DEFAULT_VIDEO_TOKEN, "").strip()
-                sentence["content"] = video_replace_token + "\n" + sentence["content"]
+            segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
+            if self.use_special_start_end_token:
+                sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(segs)
             else:
-                segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
-                if self.use_special_start_end_token:
-                    sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(
-                        segs[: img_num + 1]
-                    ) + "".join(segs[img_num + 1 :])
-                else:
-                    sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs[: img_num + 1]) + "".join(segs[img_num + 1 :])
+                sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs)
 
         return conversations
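
Note: the rewritten preprocess_multimodal drops both the DEFAULT_VIDEO_TOKEN path and the img_num cap: every occurrence of the image token is now wrapped, not just the first img_num of them. A self-contained sketch of the splice; the token strings below are stand-ins, not the real constants imported by this file:

    import re

    DEFAULT_IMAGE_TOKEN = "<image>"        # stand-in values; the real constants
    DEFAULT_IM_START_TOKEN = "<im_start>"  # are defined elsewhere in the repo
    DEFAULT_IM_END_TOKEN = "<im_end>"

    content = "Compare <image> with <image>, please."
    segs = re.split(DEFAULT_IMAGE_TOKEN, content)
    wrapped = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(segs)
    print(wrapped)
    # Compare <im_start><image><im_end> with <im_start><image><im_end>, please.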
 
@@ -265,6 +249,13 @@ class ValleyProcessor(ProcessorMixin):
 
 
     def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
+        max_pixels=kwargs.get("max_pixels", self.max_pixels)
+        min_pixels=kwargs.get("min_pixels", self.min_pixels)
+        if max_pixels is not None:
+            self.qwen2vl_image_processor.max_pixels = max_pixels
+        if min_pixels is not None:
+            self.qwen2vl_image_processor.min_pixels = min_pixels
+
         # Deal with images
         if "images" not in messages or not messages["images"] or not messages["images"][0]:
             images = [self.black_img]
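
Note: the pixel clamping is now applied per call, falling back to self.max_pixels/self.min_pixels, which are presumably set elsewhere in __init__ outside this diff's context. A hedged usage sketch reusing the processor from the first note; the message keys are inferred from this file, and the pixel budgets are example values following the Qwen2-VL 28x28-multiple convention:

    # Per-call override of the Qwen2-VL pixel budget for one oversized image.
    batch = processor(
        {
            "conversations": [{"role": "user", "content": "<image> Describe this."}],
            "images": ["demo.jpg"],
        },
        inference=True,
        max_pixels=1280 * 28 * 28,  # example budget, Qwen2-VL style
        min_pixels=256 * 28 * 28,
    )
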
@@ -289,9 +280,12 @@ class ValleyProcessor(ProcessorMixin):
         assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"
 
         # Image preprocess
-        precessed_images_siglip = self.preprocess_images_siglip(images)
+        if self.only_navit:
+            precessed_images_siglip = None
+        else:
+            precessed_images_siglip = self.preprocess_images_siglip(images)
         processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
-        source = self.preprocess_multimodal(conversations, len(precessed_images_siglip))
+        source = self.preprocess_multimodal(conversations)
         data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
 
         # Construct batch data
 
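
Note: with only_navit=True the SigLIP tensor is simply None, which is presumably why preprocess_multimodal no longer takes len(precessed_images_siglip) as its image count (calling len() on None would raise). Downstream code has to tolerate the missing tensor too; a consumer-side sketch, where the output key is an assumption about the BatchFeature layout rather than something this diff shows:

    # Guard the SigLIP slot when running NaViT-only.
    siglip_pixels = batch.get("images")  # assumed key for the SigLIP tensor
    if siglip_pixels is None:
        pass  # only_navit=True: rely on the Qwen2-VL (NaViT) pixel values instead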