diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -27,6 +27,7 @@ except: MAX_N = 6 FIX_MAX_N = 6 +LENGTH = 480 placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB) NEW_MODEL = True @@ -200,16 +201,18 @@ if NEW_MODEL: if MODEL_EPOCH == 7: model_path = './DINO_EMA_11M_b50_lr1e-5_epoch7_step380k.ckpt' elif MODEL_EPOCH == 6: - # model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt" - model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt", token=token) + model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt" + if not os.path.exists(model_path): + model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt", token=token) elif MODEL_EPOCH == 4: model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch4_step210k.ckpt" elif MODEL_EPOCH == 10: model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch10_step550k.ckpt" else: raise ValueError(f"new model epoch should be either 6 or 7, got {MODEL_EPOCH}") - # vae_path = './vae-ft-mse-840000-ema-pruned.ckpt' - vae_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="vae-ft-mse-840000-ema-pruned.ckpt", token=token) + vae_path = './vae-ft-mse-840000-ema-pruned.ckpt' + if not os.path.exists(vae_path): + vae_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="vae-ft-mse-840000-ema-pruned.ckpt", token=token) # sd_path = './sd-v1-4.ckpt' print('Load diffusion model...') diffusion = create_diffusion(str(opts.test_sampling_steps)) @@ -242,7 +245,9 @@ if NEW_MODEL: print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}") assert len(missing_keys) == 0 -sam_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="sam_vit_h_4b8939.pth", token=token) +sam_path = "sam_vit_h_4b8939.pth" +if not os.path.exists(sam_path): + sam_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", 
filename="sam_vit_h_4b8939.pth", token=token) sam_predictor = init_sam(ckpt_path=sam_path, device=pre_device) print("Mediapipe hand detector and SAM ready...") @@ -252,12 +257,15 @@ hands = mp_hands.Hands( max_num_hands=2, # Maximum number of hands to detect min_detection_confidence=0.1, ) +no_hands_open = cv2.resize(np.array(Image.open("no_hands_open.jpeg"))[..., :3], (LENGTH, LENGTH)) -def prepare_anno(ref): - if ref is None: +def prepare_anno(ref, ref_is_user): + if not ref_is_user: # no_hand_open.jpeg + return gr.update(value=None), gr.update(value=None) + if ref is None or ref["background"] is None or ref["background"].sum()==0: # clear_all return ( - gr.Image.update(value=None), - gr.State.update(value=None), + gr.update(value=None), + gr.update(value=None), ) img = ref["composite"][..., :3] img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA) @@ -286,9 +294,11 @@ def prepare_anno(ref): return img, None def get_ref_anno(img, keypts): - if keypts is None: + if img.sum() == 0: # clear_all + return None, gr.update(), None, gr.update(), True + elif keypts is None: # hand not detected no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return None, no_hands, None + return None, no_hands, None, no_hands_open, False missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False) if isinstance(keypts, list): if len(keypts[0]) == 0: @@ -297,29 +307,40 @@ def get_ref_anno(img, keypts): keypts[0] = np.array(keypts[0], dtype=np.float32) else: gr.Info("Number of right hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update() if len(keypts[1]) == 0: keypts[1] = np.zeros((21, 2)) elif len(keypts[1]) == 21: keypts[1] = np.array(keypts[1], dtype=np.float32) else: gr.Info("Number of left hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update() keypts = np.concatenate(keypts, 
axis=0) if REF_POSE_MASK: sam_predictor.set_image(img) if keypts[0].sum() != 0 and keypts[21].sum() != 0: - input_point = np.array([keypts[0], keypts[21]]) - input_label = np.array([1, 1]) + # input_point = np.array([keypts[0], keypts[21]]) + input_point = np.array(keypts) + input_box = np.stack([keypts.min(axis=0), keypts.max(axis=0)]) + # input_label = np.array([1, 1]) elif keypts[0].sum() != 0: - input_point = np.array(keypts[:1]) - input_label = np.array([1]) + # input_point = np.array(keypts[:1]) + input_point = np.array(keypts[:21]) + input_box = np.stack([keypts[:21].min(axis=0), keypts[:21].max(axis=0)]) + # input_label = np.array([1]) elif keypts[21].sum() != 0: - input_point = np.array(keypts[21:22]) - input_label = np.array([1]) + input_point = np.array(keypts[21:]) + # input_label = np.array([1]) + input_box = np.stack([keypts[21:].min(axis=0), keypts[21:].max(axis=0)]) + input_label = np.ones_like(input_point[:, 0]).astype(np.int32) + box_shift_ratio = 0.5 + box_size_factor = 1.2 + box_trans = input_box[0] * box_shift_ratio + input_box[1] * (1 - box_shift_ratio) + input_box = ((input_box - box_trans) * box_size_factor + box_trans).reshape(-1) masks, _, _ = sam_predictor.predict( point_coords=input_point, point_labels=input_label, + box=input_box[None, :], multimask_output=False, ) hand_mask = masks[0] @@ -388,12 +409,14 @@ def get_ref_anno(img, keypts): ref_cond = torch.cat([latent, heatmaps, mask], 1) print(f"ref_cond.max(): {ref_cond.max()}, ref_cond.min(): {ref_cond.min()}") - return img, ref_pose, ref_cond + return img, ref_pose, ref_cond, gr.update(), True def get_target_anno(img, keypts): - if keypts is None: + if img.sum() == 0: # clear_all + return None, gr.update(), None, gr.update(), True + if keypts is None: # hands not detected no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return None, no_hands, None, None + return None, no_hands, None, None, no_hands_open, False if isinstance(keypts, list): if 
len(keypts[0]) == 0: keypts[0] = np.zeros((21, 2)) @@ -401,14 +424,14 @@ def get_target_anno(img, keypts): keypts[0] = np.array(keypts[0], dtype=np.float32) else: gr.Info("Number of right hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update(), gr.update() if len(keypts[1]) == 0: keypts[1] = np.zeros((21, 2)) elif len(keypts[1]) == 21: keypts[1] = np.array(keypts[1], dtype=np.float32) else: gr.Info("Number of left hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update(), gr.update() keypts = np.concatenate(keypts, axis=0) target_pose = visualize_hand(keypts, img) kpts_valid = check_keypoints_validity(keypts, opts.image_size) @@ -426,26 +449,29 @@ def get_target_anno(img, keypts): [target_heatmaps, torch.zeros_like(target_heatmaps)[:, :1]], 1 ) - return img, target_pose, target_cond, keypts + return img, target_pose, target_cond, keypts, gr.update(), True +# def get_mask_inpaint(ref): +# # inpaint_mask = np.zeros_like(img_original[:, :, 0]) +# # cropped_mask = np.array(ref["layers"][0])[..., -1] +# # inpaint_mask[crop_coord[0][1]:crop_coord[1][1], crop_coord[0][0]:crop_coord[1][0]] = cropped_mask -def get_mask_inpaint(ref): - # inpaint_mask = np.zeros_like(img_original[:, :, 0]) - # cropped_mask = np.array(ref["layers"][0])[..., -1] - # inpaint_mask[crop_coord[0][1]:crop_coord[1][1], crop_coord[0][0]:crop_coord[1][0]] = cropped_mask +# return inpaint_mask + +def visualize_ref(ref): + if ref is None: + return None + + # inpaint mask inpaint_mask = np.array(ref["layers"][0])[..., -1] inpaint_mask = cv2.resize( inpaint_mask, opts.image_size, interpolation=cv2.INTER_AREA ) inpaint_mask = (inpaint_mask >= 128).astype(np.uint8) - return inpaint_mask - -def visualize_ref(brush): # crop, - if brush is None: # crop is None or - return None - inpainted = brush["layers"][0][..., -1] - img = brush["background"][..., :3] + # viualization + inpainted = 
ref["layers"][0][..., -1] + img = ref["background"][..., :3] # img = cv2.resize(img, inpainted.shape[::-1], interpolation=cv2.INTER_AREA) mask = inpainted < 128 # img = img.astype(np.int32) @@ -453,7 +479,7 @@ def visualize_ref(brush): # crop, # img[np.any(img<0, axis=-1)]=0 # img = img.astype(np.uint8) img = mask_image(img, mask) - return img + return img, inpaint_mask def get_kps(img, keypoints, side: Literal["right", "left"], evt: gr.SelectData): @@ -745,6 +771,8 @@ def sample_inpaint( cfg, quality, ): + if keypts is None: + return None, None, None set_seed(seed) N = num_gen jump_length = 10 @@ -928,14 +956,14 @@ def enable_component(image1, image2): if image1 is None or image2 is None: return gr.update(interactive=False) if isinstance(image1, dict) and "background" in image1 and "layers" in image1 and "composite" in image1: - if ( + if image1["background"] is None or ( image1["background"].sum() == 0 and (sum([im.sum() for im in image1["layers"]]) == 0) and image1["composite"].sum() == 0 ): return gr.update(interactive=False) if isinstance(image1, dict) and "background" in image2 and "layers" in image2 and "composite" in image2: - if ( + if image2["background"] is None or ( image2["background"].sum() == 0 and (sum([im.sum() for im in image2["layers"]]) == 0) and image2["composite"].sum() == 0 @@ -1029,13 +1057,6 @@ def fix_set_unvisible(): gr.update(visible=False) ) -def set_no_hands(decider, component): - if decider is None: - no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return no_hands - else: - return component - def visible_component(decider, component): if decider is not None: update_component = gr.update(visible=True) @@ -1050,7 +1071,6 @@ def unvisible_component(decider, component): update_component = gr.update(visible=True) return update_component -LENGTH = 480 example_ref_imgs = [ [ @@ -1160,20 +1180,39 @@ custom_css = """ width: 240px !important; height: 240px !important; } +#fix-tab-button { + font-size: 
18px !important; + font-weight: bold !important; + background-color: #FFDAB9 !important; +} +#repose-tab-button { + font-size: 18px !important; + font-weight: bold !important; + background-color: #90EE90 !important; +} """ +# color: black !important; _HEADER_ = '''
- Brown University - Meta Reality Labs + Kefan Chen1,2* + Chaerin Min1* + Linguang Zhang2 + Shreyas Hampali2 + Cem Keskin2 + Srinath Sridhar1 +
++ 1Brown University + 2Meta Reality Labs
Below are two important abilities of our model. First, we can edit hand poses given two hand images - one is the image to edit, and the other one provides target hand pose. Second, we can automatically fix malformed hand images, following the user-provided target hand pose and area to fix.
+Below are two important abilities of our model. First, we can automatically fix malformed hand images, following the user-provided target hand pose and area to fix. Second, we can repose a hand given two hand images - one is the image to edit, and the other one provides the target hand pose.
@article{chen2024foundhand, - title={FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation}, - author={Chen, Kefan and Min, Chaerin and Zhang, Linguang and Hampali, Shreyas and Keskin, Cem and Sridhar, Srinath}, - journal={arXiv preprint arXiv:2412.02690}, - year={2024} + title={FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation}, + author={Chen, Kefan and Min, Chaerin and Zhang, Linguang and Hampali, Shreyas and Keskin, Cem and Sridhar, Srinath}, + journal={arXiv preprint arXiv:2412.02690}, + year={2024} } -``` ++""" +_ACK_ = r""" +
+Part of this work was done during Kefan (Arthur) Chen’s internship at Meta Reality Lab. This work was additionally supported by NSF CAREER grant #2143576, NASA grant #80NSSC23M0075, and an Amazon Cloud Credits Award. +""" with gr.Blocks(css=custom_css, theme="soft") as demo: gr.Markdown(_HEADER_) - with gr.Tab("Edit Hand Poses"): - dump = gr.State(value=None) - - # ref states - ref_img = gr.State(value=None) - ref_im_raw = gr.State(value=None) - ref_kp_raw = gr.State(value=0) - ref_kp_got = gr.State(value=None) - ref_manual_cond = gr.State(value=None) - ref_auto_cond = gr.State(value=None) - ref_cond = gr.State(value=None) - - # target states - target_img = gr.State(value=None) - target_im_raw = gr.State(value=None) - target_kp_raw = gr.State(value=0) - target_kp_got = gr.State(value=None) - target_manual_keypts = gr.State(value=None) - target_auto_keypts = gr.State(value=None) - target_keypts = gr.State(value=None) - target_manual_cond = gr.State(value=None) - target_auto_cond = gr.State(value=None) - target_cond = gr.State(value=None) - - # main tab + with gr.Tab("Demo 1. Malformed Hand Correction", elem_id="fix-tab"): + # gr.Markdown("""
Demo 1. Malformed Hand Correction
""") + fix_inpaint_mask = gr.State(value=None) + fix_original = gr.State(value=None) + fix_crop_coord = gr.State(value=None) + fix_img = gr.State(value=None) + fix_kpts = gr.State(value=None) + fix_kpts_np = gr.State(value=None) + fix_ref_cond = gr.State(value=None) + fix_target_cond = gr.State(value=None) + fix_latent = gr.State(value=None) + fix_inpaint_latent = gr.State(value=None) with gr.Row(): - # ref column + # crop & brush with gr.Column(): gr.Markdown( - """1. Upload a hand image to edit π₯
""" + """1. Upload a malformed hand image π₯
""" ) gr.Markdown( - """① Optionally crop the image
""" + """Optionally crop the image.
(Click top left and bottom right of your desired bounding box around the hand)
② Hit the "Finish Cropping" button to get hand pose
""" + interactive=True, + visible=True, ) - ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False) - with gr.Tab("Automatic hand keypoints"): - ref_pose = gr.Image( - type="numpy", - label="Reference Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - ) - ref_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) - with gr.Tab("Manual hand keypoints"): - ref_manual_checkbox_info = gr.Markdown( - """Step 1. Tell us if this is right, left, or both hands.
""", - visible=True, - ) - ref_manual_checkbox = gr.CheckboxGroup( - ["Right hand", "Left hand"], - show_label=False, - visible=True, - interactive=True, - ) - ref_manual_kp_r_info = gr.Markdown( - """Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.
""", - visible=False, - ) - ref_manual_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - ref_manual_undo_right = gr.Button( - value="Undo", interactive=True, visible=False - ) - ref_manual_reset_right = gr.Button( - value="Reset", interactive=True, visible=False - ) - ref_manual_kp_l_info = gr.Markdown( - """Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.
""", - visible=False - ) - ref_manual_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - ref_manual_undo_left = gr.Button( - value="Undo", interactive=True, visible=False - ) - ref_manual_reset_left = gr.Button( - value="Reset", interactive=True, visible=False - ) - ref_manual_done_info = gr.Markdown( - """Step 3. Hit \"Done\" button to confirm.
""", - visible=False, - ) - ref_manual_done = gr.Button(value="Done", interactive=True, visible=False) - ref_manual_pose = gr.Image( - type="numpy", - label="Reference Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False - ) - ref_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) - ref_manual_instruct = gr.Markdown( - value="""OpenPose Keypoints Convention
""", - visible=True - ) - ref_manual_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - visible=True - ) gr.Markdown( - """③ Optionally flip the hand
""" + """π‘ If you crop, the model can focus on more details of the cropped area. Square crops might work better than rectangle crops.
""" ) - ref_flip = gr.Checkbox( - value=False, label="Flip Handedness (Reference)", interactive=False + # fix_tmp = gr.Image( + # type="numpy", + # label="tmp", + # show_label=True, + # height=LENGTH, + # width=LENGTH, + # interactive=True, + # visible=True, + # sources=[], + # ) + fix_example = gr.Examples( + fix_example_imgs, + inputs=[fix_crop], + examples_per_page=20, ) - - # target column with gr.Column(): gr.Markdown( - """2. Upload a hand image for target hand pose π₯
""" + """2. Brush wrong finger and its surrounding area
""" ) gr.Markdown( - """① Optionally crop the image
""" + """Don't brush the entire hand!
""" ) - target = gr.ImageEditor( + fix_ref = gr.ImageEditor( type="numpy", - label="Target", + label="Image Brushing", + sources=(), show_label=True, height=LENGTH, width=LENGTH, - brush=False, layers=False, - crop_size="1:1", - ) - gr.Examples(example_target_imgs, [target], examples_per_page=20) - gr.Markdown( - """② Hit the "Finish Cropping" button to get hand pose
""" - ) - target_finish_crop = gr.Button( - value="Finish Cropping", interactive=False - ) - with gr.Tab("Automatic hand keypoints"): - target_pose = gr.Image( - type="numpy", - label="Target Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - ) - target_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) - with gr.Tab("Manual hand keypoints"): - target_manual_checkbox_info = gr.Markdown( - """Step 1. Tell us if this is right, left, or both hands.
""", - visible=True, - ) - target_manual_checkbox = gr.CheckboxGroup( - ["Right hand", "Left hand"], - show_label=False, - visible=True, - interactive=True, - ) - target_manual_kp_r_info = gr.Markdown( - """Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.
""", - visible=False, - ) - target_manual_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - target_manual_undo_right = gr.Button( - value="Undo", interactive=True, visible=False - ) - target_manual_reset_right = gr.Button( - value="Reset", interactive=True, visible=False - ) - target_manual_kp_l_info = gr.Markdown( - """Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.
""", - visible=False - ) - target_manual_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - target_manual_undo_left = gr.Button( - value="Undo", interactive=True, visible=False - ) - target_manual_reset_left = gr.Button( - value="Reset", interactive=True, visible=False - ) - target_manual_done_info = gr.Markdown( - """Step 3. Hit \"Done\" button to confirm.
""", - visible=False, - ) - target_manual_done = gr.Button(value="Done", interactive=True, visible=False) - target_manual_pose = gr.Image( - type="numpy", - label="Target Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False - ) - target_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) - target_manual_instruct = gr.Markdown( - value="""OpenPose Keypoints Convention
""", - visible=True - ) - target_manual_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - visible=True - ) - gr.Markdown( - """③ Optionally flip the hand
""" + transforms=("brush"), + brush=gr.Brush( + colors=["rgb(255, 255, 255)"], default_size=20 + ), # 204, 50, 50 + image_mode="RGBA", + container=False, + interactive=False, ) - target_flip = gr.Checkbox( - value=False, label="Flip Handedness (Target)", interactive=False + # gr.Markdown( + # """③ Hit the \"Finish Cropping & Brushing\" button
""" + # ) + fix_finish_crop = gr.Button( + value="Finish Croping & Brushing", interactive=False ) - - # result column + + # keypoint selection with gr.Column(): gr.Markdown( - """3. Press "Run" to get the edited results π―
""" + """3. Click on hand to get target hand pose
""" ) - run = gr.Button(value="Run", interactive=False) gr.Markdown( - """β οΈ ~20s per generation with RTX3090. ~50s with A100.
(For example, if you set Number of generations as 2, it would take around 40s)
① Tell us if this is right, left, or both hands
""" ) - results = gr.Gallery( + fix_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + interactive=False, + ) + fix_kp_r_info = gr.Markdown( + """② Click 21 keypoints on the image to provide the target hand pose of right hand. See the \"OpenPose keypoints convention\" for guidance.
""", + visible=False + ) + # fix_kp_r_info = gr.Markdown( + # """Select right only
""", + # visible=False, + # ) + fix_kp_right = gr.Image( type="numpy", - label="Results", + label="Keypoint Selection (right hand)", show_label=True, height=LENGTH, - min_width=LENGTH, - columns=MAX_N, + width=LENGTH, interactive=False, - preview=True, + visible=False, + sources=[], + ) + with gr.Row(): + fix_undo_right = gr.Button( + value="Undo", interactive=False, visible=False + ) + fix_reset_right = gr.Button( + value="Reset", interactive=False, visible=False + ) + fix_kp_l_info = gr.Markdown( + """② Click 21 keypoints on the image to provide the target hand pose of left hand. See the \"OpenPose keypoints convention\" for guidance.
""", + visible=False ) - results_pose = gr.Gallery( + fix_kp_left = gr.Image( type="numpy", - label="Results Pose", + label="Keypoint Selection (left hand)", show_label=True, height=LENGTH, - min_width=LENGTH, - columns=MAX_N, + width=LENGTH, interactive=False, - preview=True, + visible=False, + sources=[], ) + with gr.Row(): + fix_undo_left = gr.Button( + value="Undo", interactive=False, visible=False + ) + fix_reset_left = gr.Button( + value="Reset", interactive=False, visible=False + ) gr.Markdown( - """β¨ Hit "Clear" to restart from the beginning
""" + """OpenPose keypoints convention
""" ) - clear = gr.ClearButton() + fix_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + ) + + # get latent + # with gr.Column(): + + # result column + with gr.Column(): + gr.Markdown( + """4. Press "Run" to get the corrected hand image π―
""" + ) + # gr.Markdown( + # """3. Press "Ready" to start pre-processing
""" + # ) + # fix_ready = gr.Button(value="Ready", interactive=False) + # gr.Markdown( + # """Visualized (256, 256)-resized, brushed image
""" + # ) + fix_vis_mask32 = gr.Image( + type="numpy", + label=f"Visualized {opts.latent_size} Inpaint Mask", + show_label=True, + height=opts.latent_size, + width=opts.latent_size, + interactive=False, + visible=False, + ) + fix_run = gr.Button(value="Run", interactive=False) + with gr.Accordion(label="Visualized (256, 256) resized, brushed image", open=False): + fix_vis_mask256 = gr.Image( + type="numpy", + show_label=False, + height=opts.image_size, + width=opts.image_size, + interactive=False, + visible=True, + ) + # gr.Markdown( + # """[NOTE] Above should be inpaint mask that you brushed, NOT the segmentation mask of the entire hand.
""" + # ) + gr.Markdown( + """β οΈ >3min and ~24GB per generation
""" + ) + fix_result_original = gr.Gallery( + type="numpy", + label="Results on original input", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + with gr.Accordion(label="Results of cropped area / Results with pose", open=False): + fix_result = gr.Gallery( + type="numpy", + label="Results", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + fix_result_pose = gr.Gallery( + type="numpy", + label="Results Pose", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + gr.Markdown( + """β¨ Hit "Clear" to restart from the beginning
""" + ) + fix_clear = gr.ClearButton() - with gr.Tab("More options"): + with gr.Accordion(label="More options", open=False): + gr.Markdown( + "β οΈ Currently, Number of generation > 1 could lead to out-of-memory" + ) with gr.Row(): - n_generation = gr.Slider( + fix_n_generation = gr.Slider( label="Number of generations", value=1, minimum=1, - maximum=MAX_N, + maximum=FIX_MAX_N, step=1, randomize=False, interactive=True, ) - seed = gr.Slider( + fix_seed = gr.Slider( label="Seed", value=42, minimum=0, @@ -1522,468 +1505,496 @@ with gr.Blocks(css=custom_css, theme="soft") as demo: randomize=False, interactive=True, ) - cfg = gr.Slider( + fix_cfg = gr.Slider( label="Classifier free guidance scale", - value=2.5, + value=3.0, minimum=0.0, maximum=10.0, step=0.1, randomize=False, interactive=True, ) - - # reference listeners - ref.change(enable_component, [ref, ref], ref_finish_crop) - ref_finish_crop.click(prepare_anno, [ref], [ref_im_raw, ref_kp_raw]) - ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_right) - ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_left) - ref_kp_raw.change(get_ref_anno, [ref_im_raw, ref_kp_raw], [ref_img, ref_pose, ref_auto_cond]) - ref_pose.change(enable_component, [ref_kp_raw, ref_pose], ref_use_auto) - ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip) - ref_auto_cond.change(lambda x: x, ref_auto_cond, ref_cond) - ref_use_auto.click(lambda x: x, ref_auto_cond, ref_cond) - ref_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Reference'", duration=3)) + fix_quality = gr.Slider( + label="Quality", + value=10, + minimum=1, + maximum=10, + step=1, + randomize=False, + interactive=True, + ) - ref_manual_checkbox.select( - set_visible, - [ref_manual_checkbox, ref_kp_got, ref_im_raw, ref_manual_kp_right, ref_manual_kp_left, ref_manual_done], - [ - ref_kp_got, - ref_manual_kp_right, - ref_manual_kp_left, - ref_manual_kp_right, - ref_manual_undo_right, - ref_manual_reset_right, - 
ref_manual_kp_left, - ref_manual_undo_left, - ref_manual_reset_left, - ref_manual_kp_r_info, - ref_manual_kp_l_info, - ref_manual_done, - ref_manual_done_info - ] + # listeners + # fix_crop.change(resize_to_full, fix_crop, fix_ref) + fix_crop.change(lambda x: x, fix_crop, fix_original) # fix_original: (real_H, real_W, 3) + fix_crop.change(stay_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) + fix_crop.select(process_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) + # fix_ref.change(disable_crop, fix_crop_coord, fix_crop) + fix_ref.change(enable_component, [fix_crop, fix_crop], fix_ref) + fix_ref.change(enable_component, [fix_crop, fix_crop], fix_finish_crop) + fix_finish_crop.click(visualize_ref, [fix_ref], [fix_img, fix_inpaint_mask]) + # fix_finish_crop.click(get_mask_inpaint, [fix_ref], []) # fix_ref: (real_cropped_H, real_cropped_W, 3) + fix_img.change(lambda x: x, [fix_img], [fix_kp_right]) + fix_img.change(lambda x: x, [fix_img], [fix_kp_left]) + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox ) - ref_manual_kp_right.select( - get_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right ) - ref_manual_undo_right.click( - undo_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right ) - ref_manual_reset_right.click( - reset_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right ) - ref_manual_kp_left.select( - get_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left ) - 
ref_manual_undo_left.click( - undo_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left ) - ref_manual_reset_left.click( - reset_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left ) - ref_manual_done.click(visible_component, [gr.State(0), ref_manual_pose], ref_manual_pose) - ref_manual_done.click(visible_component, [gr.State(0), ref_use_manual], ref_use_manual) - ref_manual_done.click(get_ref_anno, [ref_im_raw, ref_kp_got], [ref_img, ref_manual_pose, ref_manual_cond]) - ref_manual_pose.change(enable_component, [ref_manual_pose, ref_manual_pose], ref_manual_done) - ref_manual_pose.change(enable_component, [ref_img, ref_manual_pose], ref_flip) - ref_manual_cond.change(lambda x: x, ref_manual_cond, ref_cond) - ref_use_manual.click(lambda x: x, ref_manual_cond, ref_cond) - ref_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) - - ref_flip.select( - flip_hand, - [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond], - [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_run ) - - # target listeners - target.change(enable_component, [target, target], target_finish_crop) - target_finish_crop.click(prepare_anno, [target], [target_im_raw, target_kp_raw]) - target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_right) - target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_left) - target_kp_raw.change(get_target_anno, [target_im_raw, target_kp_raw], [target_img, target_pose, target_auto_cond, target_auto_keypts]) - 
target_pose.change(enable_component, [target_kp_raw, target_pose], target_use_auto) - target_pose.change(enable_component, [target_img, target_pose], target_flip) - target_auto_cond.change(lambda x: x, target_auto_cond, target_cond) - target_auto_keypts.change(lambda x: x, target_auto_keypts, target_keypts) - target_use_auto.click(lambda x: x, target_auto_cond, target_cond) - target_use_auto.click(lambda x: x, target_auto_keypts, target_keypts) - target_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Target'", duration=3)) - - target_manual_checkbox.select( + fix_checkbox.select( set_visible, - [target_manual_checkbox, target_kp_got, target_im_raw, target_manual_kp_right, target_manual_kp_left, target_manual_done], + [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left], [ - target_kp_got, - target_manual_kp_right, - target_manual_kp_left, - target_manual_kp_right, - target_manual_undo_right, - target_manual_reset_right, - target_manual_kp_left, - target_manual_undo_left, - target_manual_reset_left, - target_manual_kp_r_info, - target_manual_kp_l_info, - target_manual_done, - target_manual_done_info - ] - ) - target_manual_kp_right.select( - get_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_kpts, + fix_kp_right, + fix_kp_left, + fix_kp_right, + fix_undo_right, + fix_reset_right, + fix_kp_left, + fix_undo_left, + fix_reset_left, + fix_kp_r_info, + fix_kp_l_info, + ], ) - target_manual_undo_right.click( - undo_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_kp_right.select( + get_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] # fix_img: (real_cropped_H, real_cropped_W, 3) ) - target_manual_reset_right.click( - reset_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_undo_right.click( + undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, 
fix_kpts] ) - target_manual_kp_left.select( - get_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_reset_right.click( + reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] ) - target_manual_undo_left.click( - undo_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_kp_left.select( + get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - target_manual_reset_left.click( - reset_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_undo_left.click( + undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - target_manual_done.click(visible_component, [gr.State(0), target_manual_pose], target_manual_pose) - target_manual_done.click(visible_component, [gr.State(0), target_use_manual], target_use_manual) - target_manual_done.click(get_target_anno, [target_im_raw, target_kp_got], [target_img, target_manual_pose, target_manual_cond, target_manual_keypts]) - target_manual_pose.change(enable_component, [target_manual_pose, target_manual_pose], target_manual_done) - target_manual_pose.change(enable_component, [target_img, target_manual_pose], target_flip) - target_manual_cond.change(lambda x: x, target_manual_cond, target_cond) - target_manual_keypts.change(lambda x: x, target_manual_keypts, target_keypts) - target_use_manual.click(lambda x: x, target_manual_cond, target_cond) - target_use_manual.click(lambda x: x, target_manual_keypts, target_keypts) - target_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) - - target_flip.select( - flip_hand, - [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], - [target, target_im_raw, target_pose, target_manual_pose, 
target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], + fix_reset_left.click( + reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - - # run listerners - ref_cond.change(enable_component, [ref_cond, target_cond], run) - target_cond.change(enable_component, [ref_cond, target_cond], run) - # ref_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) - # target_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) - run.click( - sample_diff, - [ref_cond, target_cond, target_keypts, n_generation, seed, cfg], - [results, results_pose], + # fix_vis_mask32.change( + # enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run + # ) + fix_run.click( + ready_sample, + [fix_ref, fix_inpaint_mask, fix_kpts], + [ + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_kpts_np, + fix_vis_mask32, + fix_vis_mask256, + ], ) - clear.click( - clear_all, + fix_kpts_np.change( + sample_inpaint, + [ + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_kpts_np, + fix_original, + fix_crop_coord, + fix_n_generation, + fix_seed, + fix_cfg, + fix_quality, + ], + [fix_result, fix_result_pose, fix_result_original], + ) + fix_clear.click( + fix_clear_all, [], [ - ref, - ref_manual_checkbox, - ref_manual_kp_right, - ref_manual_kp_left, - ref_img, - ref_pose, - ref_manual_pose, - ref_cond, - ref_flip, - target, - target_keypts, - target_manual_checkbox, - target_manual_kp_right, - target_manual_kp_left, - target_img, - target_pose, - target_manual_pose, - target_cond, - target_flip, - results, - results_pose, - n_generation, - seed, - cfg, - ref_kp_raw, + fix_crop, + fix_crop_coord, + fix_ref, + fix_checkbox, + fix_kp_right, + fix_kp_left, + fix_result, + fix_result_pose, + fix_result_original, + fix_inpaint_mask, + fix_original, + fix_img, + fix_vis_mask32, + 
fix_vis_mask256, + fix_kpts, + fix_kpts_np, + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_n_generation, + fix_seed, + fix_cfg, + fix_quality, ], ) - clear.click( - set_unvisible, + fix_clear.click( + fix_set_unvisible, [], [ - ref_manual_kp_l_info, - ref_manual_kp_r_info, - ref_manual_kp_left, - ref_manual_kp_right, - ref_manual_undo_left, - ref_manual_undo_right, - ref_manual_reset_left, - ref_manual_reset_right, - ref_manual_done, - ref_manual_done_info, - ref_manual_pose, - ref_use_manual, - target_manual_kp_l_info, - target_manual_kp_r_info, - target_manual_kp_left, - target_manual_kp_right, - target_manual_undo_left, - target_manual_undo_right, - target_manual_reset_left, - target_manual_reset_right, - target_manual_done, - target_manual_done_info, - target_manual_pose, - target_use_manual, + fix_kp_right, + fix_kp_left, + fix_kp_r_info, + fix_kp_l_info, + fix_undo_left, + fix_undo_right, + fix_reset_left, + fix_reset_right ] ) - with gr.Tab("Fix Hands"): - fix_inpaint_mask = gr.State(value=None) - fix_original = gr.State(value=None) - fix_crop_coord = gr.State(value=None) - fix_img = gr.State(value=None) - fix_kpts = gr.State(value=None) - fix_kpts_np = gr.State(value=None) - fix_ref_cond = gr.State(value=None) - fix_target_cond = gr.State(value=None) - fix_latent = gr.State(value=None) - fix_inpaint_latent = gr.State(value=None) + with gr.Tab("Demo 2. Repose Hands", elem_id="repose-tab"): + # gr.Markdown("""Demo 2. Repose Hands
""") + dump = gr.State(value=None) + + # ref states + ref_img = gr.State(value=None) + ref_im_raw = gr.State(value=None) + ref_kp_raw = gr.State(value=0) + ref_is_user = gr.State(value=True) + ref_kp_got = gr.State(value=None) + ref_manual_cond = gr.State(value=None) + ref_auto_cond = gr.State(value=None) + ref_cond = gr.State(value=None) + + # target states + target_img = gr.State(value=None) + target_im_raw = gr.State(value=None) + target_kp_raw = gr.State(value=0) + target_is_user = gr.State(value=True) + target_kp_got = gr.State(value=None) + target_manual_keypts = gr.State(value=None) + target_auto_keypts = gr.State(value=None) + target_keypts = gr.State(value=None) + target_manual_cond = gr.State(value=None) + target_auto_cond = gr.State(value=None) + target_cond = gr.State(value=None) + + # main tab with gr.Row(): - # crop & brush + # ref column with gr.Column(): gr.Markdown( - """1. Upload a malformed hand image to fix π₯
""" + """1. Upload a hand image to repose π₯
""" ) gr.Markdown( - """① Optionally crop the image by clicking top left and bottom right of your desired bounding box around the hand.
""" + """Optionally crop the image
""" ) - # fix_crop = gr.ImageEditor( - # type="numpy", - # sources=["upload", "webcam", "clipboard"], - # label="Image crop", - # show_label=True, - # height=LENGTH, - # width=LENGTH, - # layers=False, - # # crop_size="1:1", - # transforms=(), - # brush=False, - # image_mode="RGBA", - # container=False, - # ) - fix_crop = gr.Image( + ref = gr.ImageEditor( type="numpy", - sources=["upload", "webcam", "clipboard"], - label="Input Image", + label="Reference", show_label=True, height=LENGTH, width=LENGTH, - interactive=True, - visible=True, - ) - gr.Markdown( - """π‘ If you crop, the model can focus on more details of the cropped area. Square crops might work better than rectangle crops.
""" + brush=False, + layers=False, + crop_size="1:1", ) - # fix_tmp = gr.Image( - # type="numpy", - # label="tmp", - # show_label=True, - # height=LENGTH, - # width=LENGTH, - # interactive=True, - # visible=True, - # sources=[], + gr.Examples(example_ref_imgs, [ref], examples_per_page=20) + # gr.Markdown( + # """② Hit the "Finish Cropping" button to get hand pose
""" # ) - fix_example = gr.Examples( - fix_example_imgs, - inputs=[fix_crop], - examples_per_page=20, + # ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False) + with gr.Accordion(label="See hand pose and more options", open=False): + with gr.Tab("Automatic hand keypoints"): + ref_pose = gr.Image( + type="numpy", + label="Reference Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + ) + ref_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) + with gr.Tab("Manual hand keypoints"): + ref_manual_checkbox_info = gr.Markdown( + """Step 1. Tell us if this is right, left, or both hands.
""", + visible=True, + ) + ref_manual_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + visible=True, + interactive=True, + ) + ref_manual_kp_r_info = gr.Markdown( + """Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.
""", + visible=False, + ) + ref_manual_kp_right = gr.Image( + type="numpy", + label="Keypoint Selection (right hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + ref_manual_undo_right = gr.Button( + value="Undo", interactive=True, visible=False + ) + ref_manual_reset_right = gr.Button( + value="Reset", interactive=True, visible=False + ) + ref_manual_kp_l_info = gr.Markdown( + """Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.
""", + visible=False + ) + ref_manual_kp_left = gr.Image( + type="numpy", + label="Keypoint Selection (left hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + ref_manual_undo_left = gr.Button( + value="Undo", interactive=True, visible=False + ) + ref_manual_reset_left = gr.Button( + value="Reset", interactive=True, visible=False + ) + ref_manual_done_info = gr.Markdown( + """Step 3. Hit \"Done\" button to confirm.
""", + visible=False, + ) + ref_manual_done = gr.Button(value="Done", interactive=True, visible=False) + ref_manual_pose = gr.Image( + type="numpy", + label="Reference Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False + ) + ref_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) + ref_manual_instruct = gr.Markdown( + value="""OpenPose Keypoints Convention
""", + visible=True + ) + ref_manual_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + visible=True + ) + gr.Markdown( + """Optionally flip the hand
""" + ) + ref_flip = gr.Checkbox( + value=False, label="Flip Handedness (Reference)", interactive=False + ) + + # target column + with gr.Column(): + gr.Markdown( + """2. Upload a hand image for target hand pose π₯
""" ) gr.Markdown( - """② Brush area (e.g., wrong finger) that needs to be fixed. Don't brush the entire hand!
""" + """Optionally crop the image
""" ) - fix_ref = gr.ImageEditor( + target = gr.ImageEditor( type="numpy", - label="Image Brushing", - sources=(), + label="Target", show_label=True, height=LENGTH, width=LENGTH, + brush=False, layers=False, - transforms=("brush"), - brush=gr.Brush( - colors=["rgb(255, 255, 255)"], default_size=20 - ), # 204, 50, 50 - image_mode="RGBA", - container=False, - interactive=False, - ) - gr.Markdown( - """③ Hit the \"Finish Cropping & Brushing\" button
""" - ) - fix_finish_crop = gr.Button( - value="Finish Croping & Brushing", interactive=False + crop_size="1:1", ) - - # keypoint selection + gr.Examples(example_target_imgs, [target], examples_per_page=20) + # gr.Markdown( + # """② Hit the "Finish Cropping" button to get hand pose
""" + # ) + # target_finish_crop = gr.Button( + # value="Finish Cropping", interactive=False + # ) + with gr.Accordion(label="See hand pose and more options", open=False): + with gr.Tab("Automatic hand keypoints"): + target_pose = gr.Image( + type="numpy", + label="Target Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + ) + target_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) + with gr.Tab("Manual hand keypoints"): + target_manual_checkbox_info = gr.Markdown( + """Step 1. Tell us if this is right, left, or both hands.
""", + visible=True, + ) + target_manual_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + visible=True, + interactive=True, + ) + target_manual_kp_r_info = gr.Markdown( + """Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.
""", + visible=False, + ) + target_manual_kp_right = gr.Image( + type="numpy", + label="Keypoint Selection (right hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + target_manual_undo_right = gr.Button( + value="Undo", interactive=True, visible=False + ) + target_manual_reset_right = gr.Button( + value="Reset", interactive=True, visible=False + ) + target_manual_kp_l_info = gr.Markdown( + """Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.
""", + visible=False + ) + target_manual_kp_left = gr.Image( + type="numpy", + label="Keypoint Selection (left hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + target_manual_undo_left = gr.Button( + value="Undo", interactive=True, visible=False + ) + target_manual_reset_left = gr.Button( + value="Reset", interactive=True, visible=False + ) + target_manual_done_info = gr.Markdown( + """Step 3. Hit \"Done\" button to confirm.
""", + visible=False, + ) + target_manual_done = gr.Button(value="Done", interactive=True, visible=False) + target_manual_pose = gr.Image( + type="numpy", + label="Target Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False + ) + target_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) + target_manual_instruct = gr.Markdown( + value="""OpenPose Keypoints Convention
""", + visible=True + ) + target_manual_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + visible=True + ) + gr.Markdown( + """Optionally flip the hand
""" + ) + target_flip = gr.Checkbox( + value=False, label="Flip Handedness (Target)", interactive=False + ) + + # result column with gr.Column(): gr.Markdown( - """2. Click on hand to get target hand pose
""" + """3. Press "Run" to get the reposed results π―
""" ) + run = gr.Button(value="Run", interactive=False) gr.Markdown( - """① Tell us if this is right, left, or both hands
""" + """β οΈ ~20s per generation with RTX3090. ~50s with A100.
(For example, if you set Number of generations as 2, it would take around 40s)
② Click 21 keypoints on the image to provide the target hand pose of right hand. See the \"OpenPose keypoints convention\" for guidance.
""", - visible=False - ) - # fix_kp_r_info = gr.Markdown( - # """Select right only
""", - # visible=False, - # ) - fix_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - fix_undo_right = gr.Button( - value="Undo", interactive=False, visible=False - ) - fix_reset_right = gr.Button( - value="Reset", interactive=False, visible=False - ) - fix_kp_l_info = gr.Markdown( - """② Click 21 keypoints on the image to provide the target hand pose of left hand. See the \"OpenPose keypoints convention\" for guidance.
""", - visible=False - ) - fix_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - fix_undo_left = gr.Button( - value="Undo", interactive=False, visible=False - ) - fix_reset_left = gr.Button( - value="Reset", interactive=False, visible=False - ) - gr.Markdown( - """OpenPose keypoints convention
""" - ) - fix_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - ) - - # get latent - with gr.Column(): - gr.Markdown( - """3. Press "Ready" to start pre-processing
""" - ) - fix_ready = gr.Button(value="Ready", interactive=False) - gr.Markdown( - """Visualized (256, 256)-resized, brushed image
""" - ) - fix_vis_mask32 = gr.Image( - type="numpy", - label=f"Visualized {opts.latent_size} Inpaint Mask", - show_label=True, - height=opts.latent_size, - width=opts.latent_size, - interactive=False, - visible=False, - ) - fix_vis_mask256 = gr.Image( - type="numpy", - visible=True, - show_label=False, - height=opts.image_size, - width=opts.image_size, - interactive=False, - ) - # gr.Markdown( - # """[NOTE] Above should be inpaint mask that you brushed, NOT the segmentation mask of the entire hand.
""" - # ) - - # result column - with gr.Column(): - gr.Markdown( - """4. Press "Run" to get the fixed hand image π―
""" - ) - fix_run = gr.Button(value="Run", interactive=False) - gr.Markdown( - """β οΈ >3min and ~24GB per generation
""" - ) - fix_result_original = gr.Gallery( - type="numpy", - label="Results on original input", - show_label=True, - height=LENGTH, - min_width=LENGTH, - columns=FIX_MAX_N, - interactive=False, - preview=True, - ) - fix_result = gr.Gallery( + results = gr.Gallery( type="numpy", label="Results", show_label=True, height=LENGTH, min_width=LENGTH, - columns=FIX_MAX_N, - interactive=False, - preview=True, - ) - fix_result_pose = gr.Gallery( - type="numpy", - label="Results Pose", - show_label=True, - height=LENGTH, - min_width=LENGTH, - columns=FIX_MAX_N, + columns=MAX_N, interactive=False, preview=True, ) + with gr.Accordion(label="Results with pose", open=False): + results_pose = gr.Gallery( + type="numpy", + label="Results Pose", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=MAX_N, + interactive=False, + preview=True, + ) gr.Markdown( """β¨ Hit "Clear" to restart from the beginning
""" ) - fix_clear = gr.ClearButton() + clear = gr.ClearButton() - with gr.Tab("More options"): - gr.Markdown( - "β οΈ Currently, Number of generation > 1 could lead to out-of-memory" - ) + with gr.Accordion(label="More options", open=False): with gr.Row(): - fix_n_generation = gr.Slider( + n_generation = gr.Slider( label="Number of generations", value=1, minimum=1, - maximum=FIX_MAX_N, + maximum=MAX_N, step=1, randomize=False, interactive=True, ) - fix_seed = gr.Slider( + seed = gr.Slider( label="Seed", value=42, minimum=0, @@ -1992,174 +2003,224 @@ with gr.Blocks(css=custom_css, theme="soft") as demo: randomize=False, interactive=True, ) - fix_cfg = gr.Slider( + cfg = gr.Slider( label="Classifier free guidance scale", - value=3.0, + value=2.5, minimum=0.0, maximum=10.0, step=0.1, randomize=False, interactive=True, ) - fix_quality = gr.Slider( - label="Quality", - value=10, - minimum=1, - maximum=10, - step=1, - randomize=False, - interactive=True, - ) + + # reference listeners + # ref.change(enable_component, [ref, ref], ref_finish_crop) + ref.change(prepare_anno, [ref, ref_is_user], [ref_im_raw, ref_kp_raw]) + ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_right) + ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_left) + ref_kp_raw.change(get_ref_anno, [ref_im_raw, ref_kp_raw], [ref_img, ref_pose, ref_auto_cond, ref, ref_is_user]) + ref_pose.change(enable_component, [ref_kp_raw, ref_pose], ref_use_auto) + ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip) + ref_auto_cond.change(lambda x: x, ref_auto_cond, ref_cond) + ref_use_auto.click(lambda x: x, ref_auto_cond, ref_cond) + ref_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Reference'", duration=3)) - # listeners - # fix_crop.change(resize_to_full, fix_crop, fix_ref) - fix_crop.change(lambda x: x, fix_crop, fix_original) # fix_original: (real_H, real_W, 3) - fix_crop.change(stay_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) - 
fix_crop.select(process_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) - # fix_ref.change(disable_crop, fix_crop_coord, fix_crop) - fix_ref.change(enable_component, [fix_crop, fix_crop], fix_ref) - fix_ref.change(enable_component, [fix_crop, fix_crop], fix_finish_crop) - fix_finish_crop.click(visualize_ref, [fix_ref], [fix_img]) - fix_finish_crop.click(get_mask_inpaint, [fix_ref], [fix_inpaint_mask]) # fix_ref: (real_cropped_H, real_cropped_W, 3) - fix_img.change(lambda x: x, [fix_img], [fix_kp_right]) - fix_img.change(lambda x: x, [fix_img], [fix_kp_left]) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox + ref_manual_checkbox.select( + set_visible, + [ref_manual_checkbox, ref_kp_got, ref_im_raw, ref_manual_kp_right, ref_manual_kp_left, ref_manual_done], + [ + ref_kp_got, + ref_manual_kp_right, + ref_manual_kp_left, + ref_manual_kp_right, + ref_manual_undo_right, + ref_manual_reset_right, + ref_manual_kp_left, + ref_manual_undo_left, + ref_manual_reset_left, + ref_manual_kp_r_info, + ref_manual_kp_l_info, + ref_manual_done, + ref_manual_done_info + ] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right + ref_manual_kp_right.select( + get_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right + ref_manual_undo_right.click( + undo_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right + ref_manual_reset_right.click( + reset_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left + ref_manual_kp_left.select( + get_kps, [ref_im_raw, ref_kp_got, gr.State("left")], 
[ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left + ref_manual_undo_left.click( + undo_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left + ref_manual_reset_left.click( + reset_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_ready + ref_manual_done.click(visible_component, [gr.State(0), ref_manual_pose], ref_manual_pose) + ref_manual_done.click(visible_component, [gr.State(0), ref_use_manual], ref_use_manual) + ref_manual_done.click(get_ref_anno, [ref_im_raw, ref_kp_got], [ref_img, ref_manual_pose, ref_manual_cond]) + ref_manual_pose.change(enable_component, [ref_manual_pose, ref_manual_pose], ref_manual_done) + ref_manual_pose.change(enable_component, [ref_img, ref_manual_pose], ref_flip) + ref_manual_cond.change(lambda x: x, ref_manual_cond, ref_cond) + ref_use_manual.click(lambda x: x, ref_manual_cond, ref_cond) + ref_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) + + ref_flip.select( + flip_hand, + [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond], + [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond] ) - fix_checkbox.select( + + # target listeners + # target.change(enable_component, [target, target], target_finish_crop) + target.change(prepare_anno, [target, target_is_user], [target_im_raw, target_kp_raw]) + target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_right) + target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_left) + target_kp_raw.change(get_target_anno, [target_im_raw, 
target_kp_raw], [target_img, target_pose, target_auto_cond, target_auto_keypts, target, target_is_user]) + target_pose.change(enable_component, [target_kp_raw, target_pose], target_use_auto) + target_pose.change(enable_component, [target_img, target_pose], target_flip) + target_auto_cond.change(lambda x: x, target_auto_cond, target_cond) + target_auto_keypts.change(lambda x: x, target_auto_keypts, target_keypts) + target_use_auto.click(lambda x: x, target_auto_cond, target_cond) + target_use_auto.click(lambda x: x, target_auto_keypts, target_keypts) + target_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Target'", duration=3)) + + target_manual_checkbox.select( set_visible, - [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left], + [target_manual_checkbox, target_kp_got, target_im_raw, target_manual_kp_right, target_manual_kp_left, target_manual_done], [ - fix_kpts, - fix_kp_right, - fix_kp_left, - fix_kp_right, - fix_undo_right, - fix_reset_right, - fix_kp_left, - fix_undo_left, - fix_reset_left, - fix_kp_r_info, - fix_kp_l_info, - ], - ) - fix_kp_right.select( - get_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] # fix_img: (real_cropped_H, real_cropped_W, 3) + target_kp_got, + target_manual_kp_right, + target_manual_kp_left, + target_manual_kp_right, + target_manual_undo_right, + target_manual_reset_right, + target_manual_kp_left, + target_manual_undo_left, + target_manual_reset_left, + target_manual_kp_r_info, + target_manual_kp_l_info, + target_manual_done, + target_manual_done_info + ] ) - fix_undo_right.click( - undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] + target_manual_kp_right.select( + get_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_reset_right.click( - reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] + target_manual_undo_right.click( + undo_kps, [target_im_raw, target_kp_got, 
gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_kp_left.select( - get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_reset_right.click( + reset_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_undo_left.click( - undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_kp_left.select( + get_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_reset_left.click( - reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_undo_left.click( + undo_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_vis_mask32.change( - enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run + target_manual_reset_left.click( + reset_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_ready.click( - ready_sample, - [fix_ref, fix_inpaint_mask, fix_kpts], - [ - fix_ref_cond, - fix_target_cond, - fix_latent, - fix_inpaint_latent, - fix_kpts_np, - fix_vis_mask32, - fix_vis_mask256, - ], + target_manual_done.click(visible_component, [gr.State(0), target_manual_pose], target_manual_pose) + target_manual_done.click(visible_component, [gr.State(0), target_use_manual], target_use_manual) + target_manual_done.click(get_target_anno, [target_im_raw, target_kp_got], [target_img, target_manual_pose, target_manual_cond, target_manual_keypts]) + target_manual_pose.change(enable_component, [target_manual_pose, target_manual_pose], target_manual_done) + target_manual_pose.change(enable_component, [target_img, target_manual_pose], target_flip) + target_manual_cond.change(lambda x: x, target_manual_cond, target_cond) + target_manual_keypts.change(lambda x: x, target_manual_keypts, target_keypts) + target_use_manual.click(lambda x: x, target_manual_cond, target_cond) + 
target_use_manual.click(lambda x: x, target_manual_keypts, target_keypts) + target_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) + + target_flip.select( + flip_hand, + [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], + [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], ) - fix_run.click( - sample_inpaint, - [ - fix_ref_cond, - fix_target_cond, - fix_latent, - fix_inpaint_latent, - fix_kpts_np, - fix_original, - fix_crop_coord, - fix_n_generation, - fix_seed, - fix_cfg, - fix_quality, - ], - [fix_result, fix_result_pose, fix_result_original], + + # run listerners + ref_cond.change(enable_component, [ref_cond, target_cond], run) + target_cond.change(enable_component, [ref_cond, target_cond], run) + # ref_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) + # target_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) + run.click( + sample_diff, + [ref_cond, target_cond, target_keypts, n_generation, seed, cfg], + [results, results_pose], ) - fix_clear.click( - fix_clear_all, + clear.click( + clear_all, [], [ - fix_crop, - fix_crop_coord, - fix_ref, - fix_checkbox, - fix_kp_right, - fix_kp_left, - fix_result, - fix_result_pose, - fix_result_original, - fix_inpaint_mask, - fix_original, - fix_img, - fix_vis_mask32, - fix_vis_mask256, - fix_kpts, - fix_kpts_np, - fix_ref_cond, - fix_target_cond, - fix_latent, - fix_inpaint_latent, - fix_n_generation, - fix_seed, - fix_cfg, - fix_quality, + ref, + ref_manual_checkbox, + ref_manual_kp_right, + ref_manual_kp_left, + ref_img, + ref_pose, + ref_manual_pose, + ref_cond, + ref_flip, + target, + 
target_keypts, + target_manual_checkbox, + target_manual_kp_right, + target_manual_kp_left, + target_img, + target_pose, + target_manual_pose, + target_cond, + target_flip, + results, + results_pose, + n_generation, + seed, + cfg, + ref_kp_raw, ], ) - fix_clear.click( - fix_set_unvisible, + clear.click( + set_unvisible, [], [ - fix_kp_right, - fix_kp_left, - fix_kp_r_info, - fix_kp_l_info, - fix_undo_left, - fix_undo_right, - fix_reset_left, - fix_reset_right + ref_manual_kp_l_info, + ref_manual_kp_r_info, + ref_manual_kp_left, + ref_manual_kp_right, + ref_manual_undo_left, + ref_manual_undo_right, + ref_manual_reset_left, + ref_manual_reset_right, + ref_manual_done, + ref_manual_done_info, + ref_manual_pose, + ref_use_manual, + target_manual_kp_l_info, + target_manual_kp_r_info, + target_manual_kp_left, + target_manual_kp_right, + target_manual_undo_left, + target_manual_undo_right, + target_manual_reset_left, + target_manual_reset_right, + target_manual_done, + target_manual_done_info, + target_manual_pose, + target_use_manual, ] ) + gr.Markdown("If this was useful, please cite us! β€οΈ
"""