Update web-demos/hugging_face/inpainter/base_inpainter.py
web-demos/hugging_face/inpainter/base_inpainter.py
CHANGED
@@ -20,367 +20,357 @@ warnings.filterwarnings("ignore")
(The removed side of this diff — lines 20-386 of the previous version, spanning imwrite, resize_frames, read_frame_from_videos, binary_mask, extrapolation, get_ref_index, read_mask_demo, and the ProInpainter class — was not captured by this page; only the tail of ProInpainter.inpaint is recoverable:)

-                        comp_frames[idx] = comp_frames[idx].astype(np.float32) * 0.5 + img.astype(np.float32) * 0.5
-                        comp_frames[idx] = comp_frames[idx].astype(np.uint8)
-        torch.cuda.empty_cache()
-        # need to return numpy array, T, H, W, 3
-        comp_frames = [cv2.resize(f, out_size) for f in comp_frames]
-        return comp_frames

The new version (lines 20-376) follows:

def imwrite(img, file_path, params=None, auto_mkdir=True):
    if auto_mkdir:
        dir_name = os.path.abspath(os.path.dirname(file_path))
        os.makedirs(dir_name, exist_ok=True)
    return cv2.imwrite(file_path, img, params)

def resize_frames(frames, size=None):
    if size is not None:
        out_size = size
        process_size = (out_size[0] - out_size[0] % 8, out_size[1] - out_size[1] % 8)
        frames = [f.resize(process_size) for f in frames]
    else:
        out_size = frames[0].size
        process_size = (out_size[0] - out_size[0] % 8, out_size[1] - out_size[1] % 8)
        if not out_size == process_size:
            frames = [f.resize(process_size) for f in frames]

    return frames, process_size, out_size
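
For reference, a minimal sketch of the rounding behavior; the 1000x563 frame size below is purely illustrative:

from PIL import Image

demo_frames = [Image.new("RGB", (1000, 563))]  # hypothetical frame; PIL size is (W, H)
frames, process_size, out_size = resize_frames(demo_frames)
print(process_size, out_size)  # (1000, 560) (1000, 563): processing rounds down to multiples of 8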

def read_frame_from_videos(frame_root):
    if frame_root.endswith(('mp4', 'mov', 'avi', 'MP4', 'MOV', 'AVI')):  # input video path
        video_name = os.path.basename(frame_root)[:-4]
        vframes, aframes, info = torchvision.io.read_video(filename=frame_root, pts_unit='sec')  # RGB
        frames = list(vframes.numpy())
        frames = [Image.fromarray(f) for f in frames]
        fps = info['video_fps']
    else:
        video_name = os.path.basename(frame_root)
        frames = []
        fr_lst = sorted(os.listdir(frame_root))
        for fr in fr_lst:
            frame = cv2.imread(os.path.join(frame_root, fr))
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frames.append(frame)
        fps = None
    size = frames[0].size

    return frames, fps, size, video_name
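
A hypothetical call; the path is illustrative. Video files take the first branch and return the container fps, while a directory of frame images takes the else branch and returns fps=None:

frames, fps, size, video_name = read_frame_from_videos("inputs/demo_clip.mp4")  # hypothetical path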

def binary_mask(mask, th=0.1):
    mask[mask > th] = 1
    mask[mask <= th] = 0
    return mask
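
A quick sketch of the thresholding; note that it modifies the input array in place:

import numpy as np

m = np.array([0.05, 0.2, 0.9], dtype=np.float32)
print(binary_mask(m, th=0.1))  # [0. 1. 1.]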

def extrapolation(video_ori, scale):
    """Prepares the data for video outpainting."""
    nFrame = len(video_ori)
    imgW, imgH = video_ori[0].size

    # Defines new FOV.
    imgH_extr = int(scale[0] * imgH)
    imgW_extr = int(scale[1] * imgW)
    imgH_extr = imgH_extr - imgH_extr % 8
    imgW_extr = imgW_extr - imgW_extr % 8
    H_start = int((imgH_extr - imgH) / 2)
    W_start = int((imgW_extr - imgW) / 2)

    # Extrapolates the FOV for video.
    frames = []
    for v in video_ori:
        frame = np.zeros(((imgH_extr, imgW_extr, 3)), dtype=np.uint8)
        frame[H_start: H_start + imgH, W_start: W_start + imgW, :] = v
        frames.append(Image.fromarray(frame))

    # Generates the mask for missing region.
    masks_dilated = []
    flow_masks = []

    dilate_h = 4 if H_start > 10 else 0
    dilate_w = 4 if W_start > 10 else 0
    mask = np.ones(((imgH_extr, imgW_extr)), dtype=np.uint8)

    mask[H_start + dilate_h: H_start + imgH - dilate_h,
         W_start + dilate_w: W_start + imgW - dilate_w] = 0
    flow_masks.append(Image.fromarray(mask * 255))

    mask[H_start: H_start + imgH, W_start: W_start + imgW] = 0
    masks_dilated.append(Image.fromarray(mask * 255))

    flow_masks = flow_masks * nFrame
    masks_dilated = masks_dilated * nFrame

    return frames, flow_masks, masks_dilated, (imgW_extr, imgH_extr)
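
For intuition, a sketch with illustrative numbers: a 432x240 clip with scale=(1.2, 1.2) is padded to 512x288 (each extrapolated dimension is rounded down to a multiple of 8) with the original frame centered:

import numpy as np
from PIL import Image

video = [Image.fromarray(np.zeros((240, 432, 3), dtype=np.uint8))]  # hypothetical frame
frames, flow_masks, masks_dilated, out_size = extrapolation(video, scale=(1.2, 1.2))
print(out_size)  # (512, 288)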

def get_ref_index(mid_neighbor_id, neighbor_ids, length, ref_stride=10, ref_num=-1):
    ref_index = []
    if ref_num == -1:
        for i in range(0, length, ref_stride):
            if i not in neighbor_ids:
                ref_index.append(i)
    else:
        start_idx = max(0, mid_neighbor_id - ref_stride * (ref_num // 2))
        end_idx = min(length, mid_neighbor_id + ref_stride * (ref_num // 2))
        for i in range(start_idx, end_idx, ref_stride):
            if i not in neighbor_ids:
                if len(ref_index) > ref_num:
                    break
                ref_index.append(i)
    return ref_index
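
A sketch of the reference selection with the defaults: global references are sampled every ref_stride frames, skipping anything already in the local neighborhood:

neighbor_ids = list(range(20, 31))                      # hypothetical local window
print(get_ref_index(25, neighbor_ids, length=80))       # [0, 10, 40, 50, 60, 70]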

def read_mask_demo(masks, length, size, flow_mask_dilates=8, mask_dilates=5):
    masks_img = []
    masks_dilated = []
    flow_masks = []

    for mp in masks:
        masks_img.append(Image.fromarray(mp.astype('uint8')))

    for mask_img in masks_img:
        if size is not None:
            mask_img = mask_img.resize(size, Image.NEAREST)
        mask_img = np.array(mask_img.convert('L'))

        # Dilate by 8 pixels so that all known pixels are trustworthy
        if flow_mask_dilates > 0:
            flow_mask_img = scipy.ndimage.binary_dilation(mask_img, iterations=flow_mask_dilates).astype(np.uint8)
        else:
            flow_mask_img = binary_mask(mask_img).astype(np.uint8)

        flow_masks.append(Image.fromarray(flow_mask_img * 255))

        if mask_dilates > 0:
            mask_img = scipy.ndimage.binary_dilation(mask_img, iterations=mask_dilates).astype(np.uint8)
        else:
            mask_img = binary_mask(mask_img).astype(np.uint8)
        masks_dilated.append(Image.fromarray(mask_img * 255))

    if len(masks_img) == 1:
        flow_masks = flow_masks * length
        masks_dilated = masks_dilated * length

    return flow_masks, masks_dilated
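
A minimal sketch, assuming a single hypothetical 64x64 mask that gets broadcast across a 5-frame clip:

import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 1  # hypothetical object region
flow_masks, masks_dilated = read_mask_demo([mask], length=5, size=(64, 64))
print(len(flow_masks), len(masks_dilated))  # 5 5: one mask reused for every frame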

class ProInpainter:
    def __init__(self, propainter_checkpoint, raft_checkpoint, flow_completion_checkpoint, device="cuda:0", use_half=True):
        self.device = device
        self.use_half = use_half
        if self.device == torch.device('cpu'):
            self.use_half = False

        ##############################################
        # set up RAFT and flow completion model
        ##############################################
        self.fix_raft = RAFT_bi(raft_checkpoint, self.device)

        self.fix_flow_complete = RecurrentFlowCompleteNet(flow_completion_checkpoint)
        for p in self.fix_flow_complete.parameters():
            p.requires_grad = False
        self.fix_flow_complete.to(self.device)
        self.fix_flow_complete.eval()

        ##############################################
        # set up ProPainter model
        ##############################################
        self.model = InpaintGenerator(model_path=propainter_checkpoint).to(self.device)
        self.model.eval()

        if self.use_half:
            self.fix_flow_complete = self.fix_flow_complete.half()
            self.model = self.model.half()

    def inpaint(self, npframes, masks, ratio=1.0, dilate_radius=4, raft_iter=20, subvideo_length=80, neighbor_length=10, ref_stride=10):
        """
        Perform inpainting on the given video frames.

        Output:
            inpainted_frames: numpy array, T, H, W, 3
        """

        frames = []
        for i in range(len(npframes)):
            frames.append(Image.fromarray(npframes[i].astype('uint8'), mode="RGB"))
        del npframes

        size = frames[0].size
        # The output size should be divisible by 2 so that it can be encoded by libx264
        if ratio != 1.0:
            size = (int(ratio * size[0]) // 2 * 2, int(ratio * size[1]) // 2 * 2)
        else:
            size = (size[0] // 2 * 2, size[1] // 2 * 2)

        frames_len = len(frames)
        frames, size, out_size = resize_frames(frames, size)
        flow_masks, masks_dilated = read_mask_demo(masks, frames_len, size, dilate_radius, dilate_radius)
        w, h = size

        frames_inp = [np.array(f).astype(np.uint8) for f in frames]
        frames = to_tensors()(frames).unsqueeze(0) * 2 - 1
        flow_masks = to_tensors()(flow_masks).unsqueeze(0)
        masks_dilated = to_tensors()(masks_dilated).unsqueeze(0)
        frames, flow_masks, masks_dilated = frames.to(self.device), flow_masks.to(self.device), masks_dilated.to(self.device)

        ##############################################
        # ProPainter inference
        ##############################################
        video_length = frames.size(1)
        with torch.no_grad():
            # ---- compute flow ----
            if frames.size(-1) <= 640:
                short_clip_len = 12
            elif frames.size(-1) <= 720:
                short_clip_len = 8
            elif frames.size(-1) <= 1280:
                short_clip_len = 4
            else:
                short_clip_len = 2

            # use fp32 for RAFT
            if frames.size(1) > short_clip_len:
                gt_flows_f_list, gt_flows_b_list = [], []
                for f in range(0, video_length, short_clip_len):
                    end_f = min(video_length, f + short_clip_len)
                    if f == 0:
                        flows_f, flows_b = self.fix_raft(frames[:, f:end_f], iters=raft_iter)
                    else:
                        flows_f, flows_b = self.fix_raft(frames[:, f - 1:end_f], iters=raft_iter)

                    gt_flows_f_list.append(flows_f)
                    gt_flows_b_list.append(flows_b)
                    torch.cuda.empty_cache()

                gt_flows_f = torch.cat(gt_flows_f_list, dim=1)
                gt_flows_b = torch.cat(gt_flows_b_list, dim=1)
                gt_flows_bi = (gt_flows_f, gt_flows_b)
            else:
                gt_flows_bi = self.fix_raft(frames, iters=raft_iter)
                torch.cuda.empty_cache()

            if self.use_half:
                frames, flow_masks, masks_dilated = frames.half(), flow_masks.half(), masks_dilated.half()
                gt_flows_bi = (gt_flows_bi[0].half(), gt_flows_bi[1].half())

            # ---- complete flow ----
            flow_length = gt_flows_bi[0].size(1)
            if flow_length > subvideo_length:
                pred_flows_f, pred_flows_b = [], []
                pad_len = 5
                for f in range(0, flow_length, subvideo_length):
                    s_f = max(0, f - pad_len)
                    e_f = min(flow_length, f + subvideo_length + pad_len)
                    pad_len_s = max(0, f) - s_f
                    pad_len_e = e_f - min(flow_length, f + subvideo_length)
                    pred_flows_bi_sub, _ = self.fix_flow_complete.forward_bidirect_flow(
                        (gt_flows_bi[0][:, s_f:e_f], gt_flows_bi[1][:, s_f:e_f]),
                        flow_masks[:, s_f:e_f + 1])
                    pred_flows_bi_sub = self.fix_flow_complete.combine_flow(
                        (gt_flows_bi[0][:, s_f:e_f], gt_flows_bi[1][:, s_f:e_f]),
                        pred_flows_bi_sub,
                        flow_masks[:, s_f:e_f + 1])

                    pred_flows_f.append(pred_flows_bi_sub[0][:, pad_len_s:e_f - s_f - pad_len_e])
                    pred_flows_b.append(pred_flows_bi_sub[1][:, pad_len_s:e_f - s_f - pad_len_e])
                    torch.cuda.empty_cache()

                pred_flows_f = torch.cat(pred_flows_f, dim=1)
                pred_flows_b = torch.cat(pred_flows_b, dim=1)
                pred_flows_bi = (pred_flows_f, pred_flows_b)
            else:
                pred_flows_bi, _ = self.fix_flow_complete.forward_bidirect_flow(gt_flows_bi, flow_masks)
                pred_flows_bi = self.fix_flow_complete.combine_flow(gt_flows_bi, pred_flows_bi, flow_masks)
                torch.cuda.empty_cache()

            # ---- image propagation ----
            masked_frames = frames * (1 - masks_dilated)
            subvideo_length_img_prop = min(100, subvideo_length)  # cap the sub-video length for image propagation at 100 frames
            if video_length > subvideo_length_img_prop:
                updated_frames, updated_masks = [], []
                pad_len = 10
                for f in range(0, video_length, subvideo_length_img_prop):
                    s_f = max(0, f - pad_len)
                    e_f = min(video_length, f + subvideo_length_img_prop + pad_len)
                    pad_len_s = max(0, f) - s_f
                    pad_len_e = e_f - min(video_length, f + subvideo_length_img_prop)

                    b, t, _, _, _ = masks_dilated[:, s_f:e_f].size()
                    pred_flows_bi_sub = (pred_flows_bi[0][:, s_f:e_f - 1], pred_flows_bi[1][:, s_f:e_f - 1])
                    prop_imgs_sub, updated_local_masks_sub = self.model.img_propagation(masked_frames[:, s_f:e_f],
                                                                                        pred_flows_bi_sub,
                                                                                        masks_dilated[:, s_f:e_f],
                                                                                        'nearest')
                    updated_frames_sub = frames[:, s_f:e_f] * (1 - masks_dilated[:, s_f:e_f]) + \
                                         prop_imgs_sub.view(b, t, 3, h, w) * masks_dilated[:, s_f:e_f]
                    updated_masks_sub = updated_local_masks_sub.view(b, t, 1, h, w)

                    updated_frames.append(updated_frames_sub[:, pad_len_s:e_f - s_f - pad_len_e])
                    updated_masks.append(updated_masks_sub[:, pad_len_s:e_f - s_f - pad_len_e])
                    torch.cuda.empty_cache()

                updated_frames = torch.cat(updated_frames, dim=1)
                updated_masks = torch.cat(updated_masks, dim=1)
            else:
                b, t, _, _, _ = masks_dilated.size()
                prop_imgs, updated_local_masks = self.model.img_propagation(masked_frames, pred_flows_bi, masks_dilated, 'nearest')
                updated_frames = frames * (1 - masks_dilated) + prop_imgs.view(b, t, 3, h, w) * masks_dilated
                updated_masks = updated_local_masks.view(b, t, 1, h, w)
                torch.cuda.empty_cache()

        ori_frames = frames_inp
        comp_frames = [None] * video_length

        neighbor_stride = neighbor_length // 2
        if video_length > subvideo_length:
            ref_num = subvideo_length // ref_stride
        else:
            ref_num = -1

        # ---- feature propagation + transformer ----
        for f in tqdm(range(0, video_length, neighbor_stride)):
            neighbor_ids = [
                i for i in range(max(0, f - neighbor_stride),
                                 min(video_length, f + neighbor_stride + 1))
            ]
            ref_ids = get_ref_index(f, neighbor_ids, video_length, ref_stride, ref_num)
            selected_imgs = updated_frames[:, neighbor_ids + ref_ids, :, :, :]
            selected_masks = masks_dilated[:, neighbor_ids + ref_ids, :, :, :]
            selected_update_masks = updated_masks[:, neighbor_ids + ref_ids, :, :, :]
            selected_pred_flows_bi = (pred_flows_bi[0][:, neighbor_ids[:-1], :, :, :], pred_flows_bi[1][:, neighbor_ids[:-1], :, :, :])

            with torch.no_grad():
                # 1.0 indicates mask
                l_t = len(neighbor_ids)

                # pred_img = selected_imgs  # results of image propagation
                pred_img = self.model(selected_imgs, selected_pred_flows_bi, selected_masks, selected_update_masks, l_t)

                pred_img = pred_img.view(-1, 3, h, w)

                pred_img = (pred_img + 1) / 2
                pred_img = pred_img.cpu().permute(0, 2, 3, 1).numpy() * 255
                binary_masks = masks_dilated[0, neighbor_ids, :, :, :].cpu().permute(
                    0, 2, 3, 1).numpy().astype(np.uint8)
                for i in range(len(neighbor_ids)):
                    idx = neighbor_ids[i]
                    img = np.array(pred_img[i]).astype(np.uint8) * binary_masks[i] \
                        + ori_frames[idx] * (1 - binary_masks[i])
                    if comp_frames[idx] is None:
                        comp_frames[idx] = img
                    else:
                        comp_frames[idx] = comp_frames[idx].astype(np.float32) * 0.5 + img.astype(np.float32) * 0.5

                    comp_frames[idx] = comp_frames[idx].astype(np.uint8)

        torch.cuda.empty_cache()

        # need to return numpy array, T, H, W, 3
        comp_frames = [cv2.resize(f, out_size) for f in comp_frames]

        return comp_frames
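
A minimal end-to-end sketch; the checkpoint paths, shapes, and mask region below are purely illustrative (the demo itself wires these up from its own config), and a CUDA device is assumed:

import numpy as np

inpainter = ProInpainter(
    propainter_checkpoint="weights/ProPainter.pth",                      # hypothetical path
    raft_checkpoint="weights/raft-things.pth",                           # hypothetical path
    flow_completion_checkpoint="weights/recurrent_flow_completion.pth",  # hypothetical path
    device="cuda:0",
)

frames = np.random.randint(0, 256, (24, 240, 432, 3), dtype=np.uint8)  # T, H, W, 3
masks = np.zeros((24, 240, 432), dtype=np.uint8)
masks[:, 80:160, 150:280] = 1                                          # hypothetical region to remove
comp_frames = inpainter.inpaint(list(frames), list(masks))
print(len(comp_frames), comp_frames[0].shape)                          # 24 (240, 432, 3)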