unimer_demo / unimernet /processors /formula_processor.py
wufan's picture
Upload 111 files
18e4b60 verified
raw
history blame contribute delete
6.14 kB
from unimernet.common.registry import registry
from omegaconf import OmegaConf
import albumentations as alb
from albumentations.pytorch import ToTensorV2
from unimernet.processors.base_processor import BaseProcessor
import numpy as np
import cv2
from PIL import Image, ImageOps
from torchvision.transforms.functional import resize
import random
from unimernet.processors.formula_processor_helper.nougat import Bitmap, Dilation, Erosion
from unimernet.processors.formula_processor_helper.weather import Fog, Frost, Snow, Rain, Shadow
class FormulaImageBaseProcessor(BaseProcessor):
    """Shared image preparation for the formula processors.

    Holds the target canvas size as ``self.input_size`` (``[height, width]``)
    and provides margin cropping plus resize-and-pad onto that canvas.
    """

    def __init__(self, image_size):
        """Store the target canvas size.

        Args:
            image_size: Either a two-element sequence ``[height, width]`` or a
                single integer, which is interpreted as a square canvas.
        """
        super(FormulaImageBaseProcessor, self).__init__()
        # A bare integer is not iterable; expand it so that callers (and
        # subclass defaults such as ``image_size=384``) can pass a single side.
        if isinstance(image_size, (int, np.integer)):
            image_size = [image_size, image_size]
        self.input_size = [int(_) for _ in image_size]
        assert len(self.input_size) == 2, (
            "image_size must be an int or a [height, width] pair, got %r" % (self.input_size,)
        )

    @staticmethod
    def crop_margin(img: Image.Image) -> Image.Image:
        """Crop ``img`` to the bounding box of its dark (content) pixels.

        The image is converted to grayscale and min-max normalised to 0..255;
        pixels whose normalised value is below 200 are treated as content.
        A uniform image (no contrast) is returned unchanged.
        """
        data = np.array(img.convert("L"))  # mode "L" already yields uint8
        max_val = data.max()
        min_val = data.min()
        if max_val == min_val:
            # No contrast at all: nothing to distinguish content from margin.
            return img

        # Stretch intensities to the full range so the fixed threshold below
        # behaves the same for low-contrast images.
        data = (data - min_val) / (max_val - min_val) * 255
        gray = 255 * (data < 200).astype(np.uint8)

        # The darkest pixel normalises to 0, so ``gray`` always has at least
        # one non-zero entry here.
        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
        return img.crop((a, b, w + a, h + b))

    def prepare_input(self, img: Image.Image, random_padding: bool = False):
        """Crop, resize and pad a PIL image onto the ``self.input_size`` canvas.

        Steps:
            - convert to RGB and crop the margins (``crop_margin``)
            - resize so the shorter side matches ``min(self.input_size)``
            - shrink (keeping aspect ratio) to fit inside the canvas
            - pad up to exactly ``(height, width) == self.input_size``

        Args:
            img: Input image, or ``None``.
            random_padding: If ``True`` the image is placed at a random offset
                inside the canvas; otherwise it is centred.

        Returns:
            The padded ``PIL.Image.Image``, or ``None`` when ``img`` is
            ``None``, cannot be read (``OSError``), or has a zero-sized side
            after cropping.
        """
        if img is None:
            return None

        # crop margins
        try:
            img = self.crop_margin(img.convert("RGB"))
        except OSError:
            # might throw an error for broken files
            return None

        if img.height == 0 or img.width == 0:
            return None

        canvas_height, canvas_width = self.input_size

        img = resize(img, min(self.input_size))
        # ``thumbnail`` works in place and only ever shrinks, so the image is
        # guaranteed to fit inside the canvas afterwards.
        img.thumbnail((canvas_width, canvas_height))

        delta_width = canvas_width - img.width
        delta_height = canvas_height - img.height
        if random_padding:
            pad_width = np.random.randint(low=0, high=delta_width + 1)
            pad_height = np.random.randint(low=0, high=delta_height + 1)
        else:
            pad_width = delta_width // 2
            pad_height = delta_height // 2

        # (left, top, right, bottom) border widths.
        padding = (
            pad_width,
            pad_height,
            delta_width - pad_width,
            delta_height - pad_height,
        )
        return ImageOps.expand(img, padding)
@registry.register_processor("formula_image_train")
class FormulaImageTrainProcessor(FormulaImageBaseProcessor):
    """Training-time processor: pads the image with a random offset and applies augmentations."""

    def __init__(self, image_size=384):
        """Build the augmentation pipeline.

        Args:
            image_size: ``[height, width]`` of the output canvas, or a single
                integer for a square canvas.
        """
        # The base constructor iterates over ``image_size``; expand a bare
        # integer (including the default) into a square ``[size, size]`` pair.
        if isinstance(image_size, (int, np.integer)):
            image_size = [image_size, image_size]
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                # Heavier geometric / degradation augmentations, applied as a
                # group with probability 0.15.
                alb.Compose(
                    [
                        Bitmap(p=0.05),
                        alb.OneOf([Fog(), Frost(), Snow(), Rain(), Shadow()], p=0.2),
                        alb.OneOf([Erosion((2, 3)), Dilation((2, 3))], p=0.2),
                        alb.ShiftScaleRotate(shift_limit=0, scale_limit=(-.15, 0), rotate_limit=1, border_mode=0,
                                             interpolation=3,
                                             value=[255, 255, 255],
                                             p=1),
                        alb.GridDistortion(distort_limit=0.1, border_mode=0, interpolation=3, value=[255, 255, 255],
                                           p=.5)],
                    p=.15),
                # alb.InvertImg(p=.15),
                alb.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),
                alb.GaussNoise(10, p=.2),
                alb.RandomBrightnessContrast(.05, (-.2, 0), True, p=0.2),
                alb.ImageCompression(95, p=.3),
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Prepare and augment one image.

        Args:
            item: A PIL image (or ``None``).

        Returns:
            The first channel of the transformed tensor (leading dimension of
            size 1), or ``None`` if ``prepare_input`` could not produce an image.
        """
        img = self.prepare_input(item, random_padding=True)
        if img is None:
            return img
        # All three channels are normalised with identical statistics after
        # ToGray, and only the first one is kept.
        return self.transform(image=np.array(img))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Create the processor from a config; ``image_size`` defaults to ``[384, 384]``."""
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(
            image_size=image_size,
        )
@registry.register_processor("formula_image_multi_scale_train")
class FormulaImageMultiScaleTrainProcessor(FormulaImageTrainProcessor):
    """Training processor whose canvas size can be re-drawn from a set of scales."""

    def __init__(self, all_scales):
        """Initialise with the first scale active.

        Args:
            all_scales: Non-empty sequence of ``[height, width]`` pairs. The
                values are converted to ``int``; the argument itself is not
                modified.

        Raises:
            ValueError: If ``all_scales`` is empty.
        """
        # Build a fresh list instead of assigning into ``all_scales``: the
        # caller's object (for example a config list) must stay untouched, and
        # immutable sequences such as tuples must be accepted as well.
        scales = [[int(_) for _ in scale] for scale in all_scales]
        if not scales:
            raise ValueError("all_scales must contain at least one [height, width] pair")

        super(FormulaImageMultiScaleTrainProcessor, self).__init__(scales[0])
        self.all_scales = scales

    @classmethod
    def from_config(cls, cfg=None):
        """Create the processor from a config; ``all_scales`` defaults to ``[[384, 384]]``."""
        if cfg is None:
            cfg = OmegaConf.create()

        all_scales = cfg.get("all_scales", [[384, 384]])
        return cls(
            all_scales=all_scales
        )

    def reset_scale(self):
        """Pick a new active canvas size uniformly at random from ``all_scales``."""
        self.input_size = random.choice(self.all_scales)
@registry.register_processor("formula_image_eval")
class FormulaImageEvalProcessor(FormulaImageBaseProcessor):
    """Evaluation-time processor: centred padding, grayscale, normalisation, no augmentation."""

    def __init__(self, image_size):
        """Build the deterministic evaluation transform.

        Args:
            image_size: Target canvas size, forwarded to the base processor.
        """
        super().__init__(image_size)

        self.transform = alb.Compose(
            [
                alb.ToGray(always_apply=True),
                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
                # alb.Sharpen()
                ToTensorV2(),
            ]
        )

    def __call__(self, item):
        """Prepare and normalise one image.

        Args:
            item: A PIL image (or ``None``).

        Returns:
            The first channel of the transformed tensor (leading dimension of
            size 1), or ``None`` if ``prepare_input`` could not produce an image.
        """
        image = self.prepare_input(item)
        # ``prepare_input`` returns None for missing, unreadable or empty
        # images; propagate that instead of feeding ``np.array(None)`` into
        # the transform.
        if image is None:
            return None
        return self.transform(image=np.array(image))['image'][:1]

    @classmethod
    def from_config(cls, cfg=None):
        """Create the processor from a config; ``image_size`` defaults to ``[384, 384]``."""
        if cfg is None:
            cfg = OmegaConf.create()

        image_size = cfg.get("image_size", [384, 384])

        return cls(image_size=image_size)