bytedance-research
/

Valley-Eagle-7B

Safetensors

valley

custom_code

Model card Files and versions Community

Hyggge committed on Mar 23

Commit

50e960f

1 Parent(s): 1bcb6c4

feat: update processor

Browse files

Files changed (1) hide show

processing_valley.py +18 -24

processing_valley.py CHANGED Viewed

@@ -90,17 +90,11 @@ class ValleyProcessor(ProcessorMixin):
             qwen2vl_processor_config,
         )
-        max_pixels = kwargs.get("max_pixels", None)
-        min_pixels = kwargs.get("min_pixels", None)
-        if max_pixels:
-            self.qwen2vl_image_processor.max_pixels = max_pixels
-        if min_pixels:
-            self.qwen2vl_image_processor.min_pixels = min_pixels
         self.anyres = kwargs.get("anyres", True)
         self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
         self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
         self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
     def preprocess_images_siglip(self, images) -> torch.FloatTensor:
         if isinstance(images[0], str):
@@ -150,25 +144,15 @@ class ValleyProcessor(ProcessorMixin):
         return data_dict_qwen2vl
-    def preprocess_multimodal(self, conversations, img_num):
         for sentence in conversations:
             if sentence["role"] == "system":
                 continue
-            if DEFAULT_VIDEO_TOKEN in sentence["content"]:
-                if self.use_special_start_end_token:
-                    video_replace_token = (DEFAULT_VI_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_VI_END_TOKEN) * img_num
-                else:
-                    video_replace_token = DEFAULT_IMAGE_TOKEN * img_num
-                sentence["content"] = sentence["content"].replace(DEFAULT_VIDEO_TOKEN, "").strip()
-                sentence["content"] = video_replace_token + "\n" + sentence["content"]
             else:
-                segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
-                if self.use_special_start_end_token:
-                    sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(
-                        segs[: img_num + 1]
-                    ) + "".join(segs[img_num + 1 :])
-                else:
-                    sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs[: img_num + 1]) + "".join(segs[img_num + 1 :])
         return conversations
@@ -265,6 +249,13 @@ class ValleyProcessor(ProcessorMixin):
     def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
         # Deal with images
         if "images" not in messages or not messages["images"] or not messages["images"][0]:
             images = [self.black_img]
@@ -289,9 +280,12 @@ class ValleyProcessor(ProcessorMixin):
             assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"
         # Image preprocess
-        precessed_images_siglip = self.preprocess_images_siglip(images)
         processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
-        source = self.preprocess_multimodal(conversations, len(precessed_images_siglip))
         data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
         # Construct batch data

             qwen2vl_processor_config,
         )
         self.anyres = kwargs.get("anyres", True)
         self.grid_pinpoints = kwargs.get("grid_pinpoints", "(1x1),...,(3x3)")
         self.only_crop_single_image = kwargs.get("only_crop_single_image", True)
         self.use_special_start_end_token = kwargs.get("use_special_start_end_token", True)
+        self.only_navit = kwargs.get("only_navit", False)
     def preprocess_images_siglip(self, images) -> torch.FloatTensor:
         if isinstance(images[0], str):
         return data_dict_qwen2vl
+    def preprocess_multimodal(self, conversations):
         for sentence in conversations:
             if sentence["role"] == "system":
                 continue
+            segs = re.split(DEFAULT_IMAGE_TOKEN, sentence["content"])
+            if self.use_special_start_end_token:
+                sentence["content"] = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN).join(segs)
             else:
+                sentence["content"] = DEFAULT_IMAGE_TOKEN.join(segs)
         return conversations
     def __call__(self, messages, inference=True, **kwargs) -> BatchFeature:
+        max_pixels=kwargs.get("max_pixels", self.max_pixels)
+        min_pixels=kwargs.get("min_pixels", self.min_pixels)
+        if max_pixels is not None:
+            self.qwen2vl_image_processor.max_pixels = max_pixels
+        if min_pixels is not None:
+            self.qwen2vl_image_processor.min_pixels = min_pixels
         # Deal with images
         if "images" not in messages or not messages["images"] or not messages["images"][0]:
             images = [self.black_img]
             assert conversations[-1]["role"] == "user", "the last message should be assistant if inference=True"
         # Image preprocess
+        if self.only_navit:
+            precessed_images_siglip = None
+        else:
+            precessed_images_siglip = self.preprocess_images_siglip(images)
         processed_data_dict_qwen2vl = self.preprocess_images_qwen2vl(images)
+        source = self.preprocess_multimodal(conversations)
         data_dict = self.preprocess_qwen2(source, self.tokenizer, has_image=True, only_mask_system=False, inference=inference)
         # Construct batch data