diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -27,6 +27,7 @@ except: MAX_N = 6 FIX_MAX_N = 6 +LENGTH = 480 placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB) NEW_MODEL = True @@ -200,16 +201,18 @@ if NEW_MODEL: if MODEL_EPOCH == 7: model_path = './DINO_EMA_11M_b50_lr1e-5_epoch7_step380k.ckpt' elif MODEL_EPOCH == 6: - # model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt" - model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt", token=token) + model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt" + if not os.path.exists(model_path): + model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt", token=token) elif MODEL_EPOCH == 4: model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch4_step210k.ckpt" elif MODEL_EPOCH == 10: model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch10_step550k.ckpt" else: raise ValueError(f"new model epoch should be either 6 or 7, got {MODEL_EPOCH}") - # vae_path = './vae-ft-mse-840000-ema-pruned.ckpt' - vae_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="vae-ft-mse-840000-ema-pruned.ckpt", token=token) + vae_path = './vae-ft-mse-840000-ema-pruned.ckpt' + if not os.path.exists(vae_path): + vae_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="vae-ft-mse-840000-ema-pruned.ckpt", token=token) # sd_path = './sd-v1-4.ckpt' print('Load diffusion model...') diffusion = create_diffusion(str(opts.test_sampling_steps)) @@ -242,7 +245,9 @@ if NEW_MODEL: print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}") assert len(missing_keys) == 0 -sam_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="sam_vit_h_4b8939.pth", token=token) +sam_path = "sam_vit_h_4b8939.pth" +if not os.path.exists(sam_path): + sam_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="sam_vit_h_4b8939.pth", token=token) sam_predictor = init_sam(ckpt_path=sam_path, device=pre_device) print("Mediapipe hand detector and SAM ready...") @@ -252,12 +257,15 @@ hands = mp_hands.Hands( max_num_hands=2, # Maximum number of hands to detect min_detection_confidence=0.1, ) +no_hands_open = cv2.resize(np.array(Image.open("no_hands_open.jpeg"))[..., :3], (LENGTH, LENGTH)) -def prepare_anno(ref): - if ref is None: +def prepare_anno(ref, ref_is_user): + if not ref_is_user: # no_hand_open.jpeg + return gr.update(value=None), gr.update(value=None) + if ref is None or ref["background"] is None or ref["background"].sum()==0: # clear_all return ( - gr.Image.update(value=None), - gr.State.update(value=None), + gr.update(value=None), + gr.update(value=None), ) img = ref["composite"][..., :3] img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA) @@ -286,9 +294,11 @@ def prepare_anno(ref): return img, None def get_ref_anno(img, keypts): - if keypts is None: + if img.sum() == 0: # clear_all + return None, gr.update(), None, gr.update(), True + elif keypts is None: # hand not detected no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return None, no_hands, None + return None, no_hands, None, no_hands_open, False missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False) if isinstance(keypts, list): if len(keypts[0]) == 0: @@ -297,29 +307,40 @@ def get_ref_anno(img, keypts): keypts[0] = np.array(keypts[0], dtype=np.float32) else: gr.Info("Number of right hand 
keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update() if len(keypts[1]) == 0: keypts[1] = np.zeros((21, 2)) elif len(keypts[1]) == 21: keypts[1] = np.array(keypts[1], dtype=np.float32) else: gr.Info("Number of left hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update() keypts = np.concatenate(keypts, axis=0) if REF_POSE_MASK: sam_predictor.set_image(img) if keypts[0].sum() != 0 and keypts[21].sum() != 0: - input_point = np.array([keypts[0], keypts[21]]) - input_label = np.array([1, 1]) + # input_point = np.array([keypts[0], keypts[21]]) + input_point = np.array(keypts) + input_box = np.stack([keypts.min(axis=0), keypts.max(axis=0)]) + # input_label = np.array([1, 1]) elif keypts[0].sum() != 0: - input_point = np.array(keypts[:1]) - input_label = np.array([1]) + # input_point = np.array(keypts[:1]) + input_point = np.array(keypts[:21]) + input_box = np.stack([keypts[:21].min(axis=0), keypts[:21].max(axis=0)]) + # input_label = np.array([1]) elif keypts[21].sum() != 0: - input_point = np.array(keypts[21:22]) - input_label = np.array([1]) + input_point = np.array(keypts[21:]) + # input_label = np.array([1]) + input_box = np.stack([keypts[21:].min(axis=0), keypts[21:].max(axis=0)]) + input_label = np.ones_like(input_point[:, 0]).astype(np.int32) + box_shift_ratio = 0.5 + box_size_factor = 1.2 + box_trans = input_box[0] * box_shift_ratio + input_box[1] * (1 - box_shift_ratio) + input_box = ((input_box - box_trans) * box_size_factor + box_trans).reshape(-1) masks, _, _ = sam_predictor.predict( point_coords=input_point, point_labels=input_label, + box=input_box[None, :], multimask_output=False, ) hand_mask = masks[0] @@ -388,12 +409,14 @@ def get_ref_anno(img, keypts): ref_cond = torch.cat([latent, heatmaps, mask], 1) print(f"ref_cond.max(): {ref_cond.max()}, ref_cond.min(): {ref_cond.min()}") - return img, ref_pose, ref_cond + return img, ref_pose, ref_cond, gr.update(), True def get_target_anno(img, keypts): - if keypts is None: + if img.sum() == 0: # clear_all + return None, gr.update(), None, gr.update(), True + if keypts is None: # hands not detected no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return None, no_hands, None, None + return None, no_hands, None, None, no_hands_open, False if isinstance(keypts, list): if len(keypts[0]) == 0: keypts[0] = np.zeros((21, 2)) @@ -401,14 +424,14 @@ def get_target_anno(img, keypts): keypts[0] = np.array(keypts[0], dtype=np.float32) else: gr.Info("Number of right hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update(), gr.update() if len(keypts[1]) == 0: keypts[1] = np.zeros((21, 2)) elif len(keypts[1]) == 21: keypts[1] = np.array(keypts[1], dtype=np.float32) else: gr.Info("Number of left hand keypoints should be either 0 or 21.") - return None, None, None + return None, None, None, gr.update(), gr.update(), gr.update() keypts = np.concatenate(keypts, axis=0) target_pose = visualize_hand(keypts, img) kpts_valid = check_keypoints_validity(keypts, opts.image_size) @@ -426,26 +449,29 @@ def get_target_anno(img, keypts): [target_heatmaps, torch.zeros_like(target_heatmaps)[:, :1]], 1 ) - return img, target_pose, target_cond, keypts + return img, target_pose, target_cond, keypts, gr.update(), True +# def get_mask_inpaint(ref): +# # inpaint_mask = np.zeros_like(img_original[:, :, 0]) +# # cropped_mask = 
np.array(ref["layers"][0])[..., -1] +# # inpaint_mask[crop_coord[0][1]:crop_coord[1][1], crop_coord[0][0]:crop_coord[1][0]] = cropped_mask -def get_mask_inpaint(ref): - # inpaint_mask = np.zeros_like(img_original[:, :, 0]) - # cropped_mask = np.array(ref["layers"][0])[..., -1] - # inpaint_mask[crop_coord[0][1]:crop_coord[1][1], crop_coord[0][0]:crop_coord[1][0]] = cropped_mask +# return inpaint_mask + +def visualize_ref(ref): + if ref is None: + return None + + # inpaint mask inpaint_mask = np.array(ref["layers"][0])[..., -1] inpaint_mask = cv2.resize( inpaint_mask, opts.image_size, interpolation=cv2.INTER_AREA ) inpaint_mask = (inpaint_mask >= 128).astype(np.uint8) - return inpaint_mask - -def visualize_ref(brush): # crop, - if brush is None: # crop is None or - return None - inpainted = brush["layers"][0][..., -1] - img = brush["background"][..., :3] + # viualization + inpainted = ref["layers"][0][..., -1] + img = ref["background"][..., :3] # img = cv2.resize(img, inpainted.shape[::-1], interpolation=cv2.INTER_AREA) mask = inpainted < 128 # img = img.astype(np.int32) @@ -453,7 +479,7 @@ def visualize_ref(brush): # crop, # img[np.any(img<0, axis=-1)]=0 # img = img.astype(np.uint8) img = mask_image(img, mask) - return img + return img, inpaint_mask def get_kps(img, keypoints, side: Literal["right", "left"], evt: gr.SelectData): @@ -745,6 +771,8 @@ def sample_inpaint( cfg, quality, ): + if keypts is None: + return None, None, None set_seed(seed) N = num_gen jump_length = 10 @@ -928,14 +956,14 @@ def enable_component(image1, image2): if image1 is None or image2 is None: return gr.update(interactive=False) if isinstance(image1, dict) and "background" in image1 and "layers" in image1 and "composite" in image1: - if ( + if image1["background"] is None or ( image1["background"].sum() == 0 and (sum([im.sum() for im in image1["layers"]]) == 0) and image1["composite"].sum() == 0 ): return gr.update(interactive=False) if isinstance(image1, dict) and "background" in image2 and "layers" in image2 and "composite" in image2: - if ( + if image2["background"] is None or ( image2["background"].sum() == 0 and (sum([im.sum() for im in image2["layers"]]) == 0) and image2["composite"].sum() == 0 @@ -1029,13 +1057,6 @@ def fix_set_unvisible(): gr.update(visible=False) ) -def set_no_hands(decider, component): - if decider is None: - no_hands = cv2.resize(np.array(Image.open("no_hands.png"))[..., :3], (LENGTH, LENGTH)) - return no_hands - else: - return component - def visible_component(decider, component): if decider is not None: update_component = gr.update(visible=True) @@ -1050,7 +1071,6 @@ def unvisible_component(decider, component): update_component = gr.update(visible=True) return update_component -LENGTH = 480 example_ref_imgs = [ [ @@ -1160,20 +1180,39 @@ custom_css = """ width: 240px !important; height: 240px !important; } +#fix-tab-button { + font-size: 18px !important; + font-weight: bold !important; + background-color: #FFDAB9 !important; +} +#repose-tab-button { + font-size: 18px !important; + font-weight: bold !important; + background-color: #90EE90 !important; +} """ +# color: black !important; _HEADER_ = '''

FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation

-

CVPR 2025

+

CVPR 2025 Highlight

- Brown University - Meta Reality Labs + Kefan Chen1,2* + Chaerin Min1* + Linguang Zhang2 + Shreyas Hampali2 + Cem Keskin2 + Srinath Sridhar1 +

+

+ 1Brown University + 2Meta Reality Labs

Paper @@ -1181,339 +1220,283 @@ _HEADER_ = ''' Code Model Weights

-

Below are two important abilities of our model. First, we can edit hand poses given two hand images - one is the image to edit, and the other one provides target hand pose. Second, we can automatically fix malformed hand images, following the user-provided target hand pose and area to fix.

+

Below are two key abilities of our model. First, it can automatically fix malformed hand images, following a user-provided target hand pose and area to fix. Second, it can repose a hand given two hand images: one is the image to edit, and the other provides the target hand pose.
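Concretely, the reference image is encoded as a VAE latent stacked with per-keypoint heatmaps and a hand mask, while the target contributes pose heatmaps only. A minimal sketch of that conditioning assembly (shapes are illustrative assumptions; `get_ref_anno` and `get_target_anno` in this file hold the real code):

```python
import torch

latent = torch.randn(1, 4, 32, 32)             # VAE encoding of the reference image
heatmaps = torch.randn(1, 42, 32, 32)          # one heatmap per keypoint, two hands
mask = torch.ones(1, 1, 32, 32)                # hand mask (from SAM)
ref_cond = torch.cat([latent, heatmaps, mask], 1)

target_heatmaps = torch.randn(1, 42, 32, 32)   # target pose carries no appearance
target_cond = torch.cat([target_heatmaps, torch.zeros_like(target_heatmaps)[:, :1]], 1)
```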

''' _CITE_ = r""" -``` +
     @article{chen2024foundhand,
-    title={FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation},
-    author={Chen, Kefan and Min, Chaerin and Zhang, Linguang and Hampali, Shreyas and Keskin, Cem and Sridhar, Srinath},
-    journal={arXiv preprint arXiv:2412.02690},
-    year={2024}
+        title={FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation},
+        author={Chen, Kefan and Min, Chaerin and Zhang, Linguang and Hampali, Shreyas and Keskin, Cem and Sridhar, Srinath},
+        journal={arXiv preprint arXiv:2412.02690},
+        year={2024}
     }
-```
+
+""" +_ACK_ = r""" +
+Part of this work was done during Kefan (Arthur) Chen’s internship at Meta Reality Labs. This work was additionally supported by NSF CAREER grant #2143576, NASA grant #80NSSC23M0075, and an Amazon Cloud Credits Award.
+
""" with gr.Blocks(css=custom_css, theme="soft") as demo: gr.Markdown(_HEADER_) - with gr.Tab("Edit Hand Poses"): - dump = gr.State(value=None) - - # ref states - ref_img = gr.State(value=None) - ref_im_raw = gr.State(value=None) - ref_kp_raw = gr.State(value=0) - ref_kp_got = gr.State(value=None) - ref_manual_cond = gr.State(value=None) - ref_auto_cond = gr.State(value=None) - ref_cond = gr.State(value=None) - - # target states - target_img = gr.State(value=None) - target_im_raw = gr.State(value=None) - target_kp_raw = gr.State(value=0) - target_kp_got = gr.State(value=None) - target_manual_keypts = gr.State(value=None) - target_auto_keypts = gr.State(value=None) - target_keypts = gr.State(value=None) - target_manual_cond = gr.State(value=None) - target_auto_cond = gr.State(value=None) - target_cond = gr.State(value=None) - - # main tab + with gr.Tab("Demo 1. Malformed Hand Correction", elem_id="fix-tab"): + # gr.Markdown("""

Demo 1. Malformed Hand Correction

""") + fix_inpaint_mask = gr.State(value=None) + fix_original = gr.State(value=None) + fix_crop_coord = gr.State(value=None) + fix_img = gr.State(value=None) + fix_kpts = gr.State(value=None) + fix_kpts_np = gr.State(value=None) + fix_ref_cond = gr.State(value=None) + fix_target_cond = gr.State(value=None) + fix_latent = gr.State(value=None) + fix_inpaint_latent = gr.State(value=None) with gr.Row(): - # ref column + # crop & brush with gr.Column(): gr.Markdown( - """

1. Upload a hand image to edit 📥

""" + """

1. Upload a malformed hand image 📥

""" ) gr.Markdown( - """

① Optionally crop the image

""" + """

Optionally crop the image.
(Click the top left and bottom right of your desired bounding box around the hand.)
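A rough sketch of this two-click flow (illustrative only; `process_crop` in this PR is the actual handler): the first click fixes the top-left corner, the second completes the box.

```python
import numpy as np

def two_click_crop(img: np.ndarray, clicks: list) -> np.ndarray:
    # Crop once two (x, y) clicks are collected; otherwise return img unchanged.
    if len(clicks) < 2:
        return img
    (x0, y0), (x1, y1) = clicks
    left, right = sorted((x0, x1))
    top, bottom = sorted((y0, y1))
    return img[top:bottom, left:right]  # rows index y, columns index x
```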

""" ) - ref = gr.ImageEditor( + # fix_crop = gr.ImageEditor( + # type="numpy", + # sources=["upload", "webcam", "clipboard"], + # label="Image crop", + # show_label=True, + # height=LENGTH, + # width=LENGTH, + # layers=False, + # # crop_size="1:1", + # transforms=(), + # brush=False, + # image_mode="RGBA", + # container=False, + # ) + fix_crop = gr.Image( type="numpy", - label="Reference", + sources=["upload", "webcam", "clipboard"], + label="Input Image", show_label=True, height=LENGTH, width=LENGTH, - brush=False, - layers=False, - crop_size="1:1", - ) - gr.Examples(example_ref_imgs, [ref], examples_per_page=20) - gr.Markdown( - """

② Hit the "Finish Cropping" button to get hand pose

""" + interactive=True, + visible=True, ) - ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False) - with gr.Tab("Automatic hand keypoints"): - ref_pose = gr.Image( - type="numpy", - label="Reference Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - ) - ref_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) - with gr.Tab("Manual hand keypoints"): - ref_manual_checkbox_info = gr.Markdown( - """

Step 1. Tell us if this is right, left, or both hands.

""", - visible=True, - ) - ref_manual_checkbox = gr.CheckboxGroup( - ["Right hand", "Left hand"], - show_label=False, - visible=True, - interactive=True, - ) - ref_manual_kp_r_info = gr.Markdown( - """

Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.

""", - visible=False, - ) - ref_manual_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - ref_manual_undo_right = gr.Button( - value="Undo", interactive=True, visible=False - ) - ref_manual_reset_right = gr.Button( - value="Reset", interactive=True, visible=False - ) - ref_manual_kp_l_info = gr.Markdown( - """

Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.

""", - visible=False - ) - ref_manual_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - ref_manual_undo_left = gr.Button( - value="Undo", interactive=True, visible=False - ) - ref_manual_reset_left = gr.Button( - value="Reset", interactive=True, visible=False - ) - ref_manual_done_info = gr.Markdown( - """

Step 3. Hit \"Done\" button to confirm.

""", - visible=False, - ) - ref_manual_done = gr.Button(value="Done", interactive=True, visible=False) - ref_manual_pose = gr.Image( - type="numpy", - label="Reference Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False - ) - ref_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) - ref_manual_instruct = gr.Markdown( - value="""

OpenPose Keypoints Convention

""", - visible=True - ) - ref_manual_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - visible=True - ) gr.Markdown( - """

③ Optionally flip the hand

""" + """

💡 If you crop, the model can focus on more details of the cropped area. Square crops might work better than rectangular crops.
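One way to see why: the crop is later resized to the model's square input (`opts.image_size`), so a square crop avoids aspect-ratio distortion. A hypothetical helper:

```python
import numpy as np

def center_square_crop(img: np.ndarray) -> np.ndarray:
    # Largest centered square, so a subsequent square resize does not distort.
    h, w = img.shape[:2]
    s = min(h, w)
    top, left = (h - s) // 2, (w - s) // 2
    return img[top:top + s, left:left + s]
```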

""" ) - ref_flip = gr.Checkbox( - value=False, label="Flip Handedness (Reference)", interactive=False + # fix_tmp = gr.Image( + # type="numpy", + # label="tmp", + # show_label=True, + # height=LENGTH, + # width=LENGTH, + # interactive=True, + # visible=True, + # sources=[], + # ) + fix_example = gr.Examples( + fix_example_imgs, + inputs=[fix_crop], + examples_per_page=20, ) - - # target column with gr.Column(): gr.Markdown( - """

2. Upload a hand image for target hand pose 📥

""" + """

2. Brush the wrong finger and its surrounding area

""" ) gr.Markdown( - """

① Optionally crop the image

""" + """

Don't brush the entire hand!
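The strokes become the inpainting mask, mirroring `visualize_ref` above: the brushed layer's alpha channel is resized and thresholded, and only masked pixels get regenerated, so brushing the whole hand leaves nothing for the model to copy from. A sketch, assuming the 256x256 working resolution used elsewhere in this demo:

```python
import cv2
import numpy as np

def brush_to_mask(editor_value: dict, size=(256, 256)) -> np.ndarray:
    # Binarize the brushed layer's alpha channel into a 0/1 inpaint mask.
    alpha = np.array(editor_value["layers"][0])[..., -1]
    alpha = cv2.resize(alpha, size, interpolation=cv2.INTER_AREA)
    return (alpha >= 128).astype(np.uint8)
```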

""" ) - target = gr.ImageEditor( + fix_ref = gr.ImageEditor( type="numpy", - label="Target", + label="Image Brushing", + sources=(), show_label=True, height=LENGTH, width=LENGTH, - brush=False, layers=False, - crop_size="1:1", - ) - gr.Examples(example_target_imgs, [target], examples_per_page=20) - gr.Markdown( - """

② Hit the "Finish Cropping" button to get hand pose

""" - ) - target_finish_crop = gr.Button( - value="Finish Cropping", interactive=False - ) - with gr.Tab("Automatic hand keypoints"): - target_pose = gr.Image( - type="numpy", - label="Target Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - ) - target_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) - with gr.Tab("Manual hand keypoints"): - target_manual_checkbox_info = gr.Markdown( - """

Step 1. Tell us if this is right, left, or both hands.

""", - visible=True, - ) - target_manual_checkbox = gr.CheckboxGroup( - ["Right hand", "Left hand"], - show_label=False, - visible=True, - interactive=True, - ) - target_manual_kp_r_info = gr.Markdown( - """

Step 2. Click on image to provide hand keypoints for right hand. See \"OpenPose Keypoint Convention\" for guidance.

""", - visible=False, - ) - target_manual_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - target_manual_undo_right = gr.Button( - value="Undo", interactive=True, visible=False - ) - target_manual_reset_right = gr.Button( - value="Reset", interactive=True, visible=False - ) - target_manual_kp_l_info = gr.Markdown( - """

Step 2. Click on image to provide hand keypoints for left hand. See \"OpenPose keypoint convention\" for guidance.

""", - visible=False - ) - target_manual_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - target_manual_undo_left = gr.Button( - value="Undo", interactive=True, visible=False - ) - target_manual_reset_left = gr.Button( - value="Reset", interactive=True, visible=False - ) - target_manual_done_info = gr.Markdown( - """

Step 3. Hit \"Done\" button to confirm.

""", - visible=False, - ) - target_manual_done = gr.Button(value="Done", interactive=True, visible=False) - target_manual_pose = gr.Image( - type="numpy", - label="Target Pose", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False - ) - target_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) - target_manual_instruct = gr.Markdown( - value="""

OpenPose Keypoints Convention

""", - visible=True - ) - target_manual_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - visible=True - ) - gr.Markdown( - """

③ Optionally flip the hand

""" + transforms=("brush"), + brush=gr.Brush( + colors=["rgb(255, 255, 255)"], default_size=20 + ), # 204, 50, 50 + image_mode="RGBA", + container=False, + interactive=False, ) - target_flip = gr.Checkbox( - value=False, label="Flip Handedness (Target)", interactive=False + # gr.Markdown( + # """

③ Hit the \"Finish Cropping & Brushing\" button

""" + # ) + fix_finish_crop = gr.Button( + value="Finish Croping & Brushing", interactive=False ) - - # result column + + # keypoint selection with gr.Column(): gr.Markdown( - """

3. Press "Run" to get the edited results 🎯

""" + """

3. Click on the hand to provide the target hand pose

""" ) - run = gr.Button(value="Run", interactive=False) gr.Markdown( - """

⚠️ ~20s per generation with RTX3090. ~50s with A100.
(For example, if you set Number of generations as 2, it would take around 40s)

""" + """

① Tell us if this is right, left, or both hands

""" ) - results = gr.Gallery( + fix_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + interactive=False, + ) + fix_kp_r_info = gr.Markdown( + """

② Click 21 keypoints on the image to provide the target hand pose of the right hand. See the \"OpenPose keypoints convention\" below for guidance.
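For reference, the 21-point OpenPose hand layout used here: index 0 is the wrist, then four joints per finger from palm to tip.

```python
OPENPOSE_HAND_LAYOUT = {
    "wrist": [0],
    "thumb": [1, 2, 3, 4],
    "index": [5, 6, 7, 8],
    "middle": [9, 10, 11, 12],
    "ring": [13, 14, 15, 16],
    "pinky": [17, 18, 19, 20],
}  # 21 keypoints in total per hand
```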

""", + visible=False + ) + # fix_kp_r_info = gr.Markdown( + # """

Select right only

""", + # visible=False, + # ) + fix_kp_right = gr.Image( type="numpy", - label="Results", + label="Keypoint Selection (right hand)", show_label=True, height=LENGTH, - min_width=LENGTH, - columns=MAX_N, + width=LENGTH, interactive=False, - preview=True, + visible=False, + sources=[], + ) + with gr.Row(): + fix_undo_right = gr.Button( + value="Undo", interactive=False, visible=False + ) + fix_reset_right = gr.Button( + value="Reset", interactive=False, visible=False + ) + fix_kp_l_info = gr.Markdown( + """

② Click 21 keypoints on the image to provide the target hand pose of the left hand. See the \"OpenPose keypoints convention\" below for guidance.

""", + visible=False ) - results_pose = gr.Gallery( + fix_kp_left = gr.Image( type="numpy", - label="Results Pose", + label="Keypoint Selection (left hand)", show_label=True, height=LENGTH, - min_width=LENGTH, - columns=MAX_N, + width=LENGTH, interactive=False, - preview=True, + visible=False, + sources=[], ) + with gr.Row(): + fix_undo_left = gr.Button( + value="Undo", interactive=False, visible=False + ) + fix_reset_left = gr.Button( + value="Reset", interactive=False, visible=False + ) gr.Markdown( - """

✨ Hit "Clear" to restart from the beginning

""" + """

OpenPose keypoints convention

""" ) - clear = gr.ClearButton() + fix_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + ) + + # get latent + # with gr.Column(): + + # result column + with gr.Column(): + gr.Markdown( + """

4. Press "Run" to get the corrected hand image 🎯

""" + ) + # gr.Markdown( + # """

3. Press "Ready" to start pre-processing

""" + # ) + # fix_ready = gr.Button(value="Ready", interactive=False) + # gr.Markdown( + # """

Visualized (256, 256)-resized, brushed image

""" + # ) + fix_vis_mask32 = gr.Image( + type="numpy", + label=f"Visualized {opts.latent_size} Inpaint Mask", + show_label=True, + height=opts.latent_size, + width=opts.latent_size, + interactive=False, + visible=False, + ) + fix_run = gr.Button(value="Run", interactive=False) + with gr.Accordion(label="Visualized (256, 256) resized, brushed image", open=False): + fix_vis_mask256 = gr.Image( + type="numpy", + show_label=False, + height=opts.image_size, + width=opts.image_size, + interactive=False, + visible=True, + ) + # gr.Markdown( + # """

[NOTE] Above should be inpaint mask that you brushed, NOT the segmentation mask of the entire hand.

""" + # ) + gr.Markdown( + """

⚠️ >3 min and ~24 GB per generation

""" + ) + fix_result_original = gr.Gallery( + type="numpy", + label="Results on original input", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + with gr.Accordion(label="Results of cropped area / Results with pose", open=False): + fix_result = gr.Gallery( + type="numpy", + label="Results", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + fix_result_pose = gr.Gallery( + type="numpy", + label="Results Pose", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=FIX_MAX_N, + interactive=False, + preview=True, + ) + gr.Markdown( + """

✨ Hit "Clear" to restart from the beginning

""" + ) + fix_clear = gr.ClearButton() - with gr.Tab("More options"): + with gr.Accordion(label="More options", open=False): + gr.Markdown( + "⚠️ Currently, Number of generation > 1 could lead to out-of-memory" + ) with gr.Row(): - n_generation = gr.Slider( + fix_n_generation = gr.Slider( label="Number of generations", value=1, minimum=1, - maximum=MAX_N, + maximum=FIX_MAX_N, step=1, randomize=False, interactive=True, ) - seed = gr.Slider( + fix_seed = gr.Slider( label="Seed", value=42, minimum=0, @@ -1522,468 +1505,496 @@ with gr.Blocks(css=custom_css, theme="soft") as demo: randomize=False, interactive=True, ) - cfg = gr.Slider( + fix_cfg = gr.Slider( label="Classifier free guidance scale", - value=2.5, + value=3.0, minimum=0.0, maximum=10.0, step=0.1, randomize=False, interactive=True, ) - - # reference listeners - ref.change(enable_component, [ref, ref], ref_finish_crop) - ref_finish_crop.click(prepare_anno, [ref], [ref_im_raw, ref_kp_raw]) - ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_right) - ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_left) - ref_kp_raw.change(get_ref_anno, [ref_im_raw, ref_kp_raw], [ref_img, ref_pose, ref_auto_cond]) - ref_pose.change(enable_component, [ref_kp_raw, ref_pose], ref_use_auto) - ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip) - ref_auto_cond.change(lambda x: x, ref_auto_cond, ref_cond) - ref_use_auto.click(lambda x: x, ref_auto_cond, ref_cond) - ref_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Reference'", duration=3)) + fix_quality = gr.Slider( + label="Quality", + value=10, + minimum=1, + maximum=10, + step=1, + randomize=False, + interactive=True, + ) - ref_manual_checkbox.select( - set_visible, - [ref_manual_checkbox, ref_kp_got, ref_im_raw, ref_manual_kp_right, ref_manual_kp_left, ref_manual_done], - [ - ref_kp_got, - ref_manual_kp_right, - ref_manual_kp_left, - ref_manual_kp_right, - ref_manual_undo_right, - ref_manual_reset_right, - ref_manual_kp_left, - ref_manual_undo_left, - ref_manual_reset_left, - ref_manual_kp_r_info, - ref_manual_kp_l_info, - ref_manual_done, - ref_manual_done_info - ] + # listeners + # fix_crop.change(resize_to_full, fix_crop, fix_ref) + fix_crop.change(lambda x: x, fix_crop, fix_original) # fix_original: (real_H, real_W, 3) + fix_crop.change(stay_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) + fix_crop.select(process_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) + # fix_ref.change(disable_crop, fix_crop_coord, fix_crop) + fix_ref.change(enable_component, [fix_crop, fix_crop], fix_ref) + fix_ref.change(enable_component, [fix_crop, fix_crop], fix_finish_crop) + fix_finish_crop.click(visualize_ref, [fix_ref], [fix_img, fix_inpaint_mask]) + # fix_finish_crop.click(get_mask_inpaint, [fix_ref], []) # fix_ref: (real_cropped_H, real_cropped_W, 3) + fix_img.change(lambda x: x, [fix_img], [fix_kp_right]) + fix_img.change(lambda x: x, [fix_img], [fix_kp_left]) + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox ) - ref_manual_kp_right.select( - get_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right ) - ref_manual_undo_right.click( - undo_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right ) - ref_manual_reset_right.click( - 
reset_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right ) - ref_manual_kp_left.select( - get_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left ) - ref_manual_undo_left.click( - undo_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left ) - ref_manual_reset_left.click( - reset_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left ) - ref_manual_done.click(visible_component, [gr.State(0), ref_manual_pose], ref_manual_pose) - ref_manual_done.click(visible_component, [gr.State(0), ref_use_manual], ref_use_manual) - ref_manual_done.click(get_ref_anno, [ref_im_raw, ref_kp_got], [ref_img, ref_manual_pose, ref_manual_cond]) - ref_manual_pose.change(enable_component, [ref_manual_pose, ref_manual_pose], ref_manual_done) - ref_manual_pose.change(enable_component, [ref_img, ref_manual_pose], ref_flip) - ref_manual_cond.change(lambda x: x, ref_manual_cond, ref_cond) - ref_use_manual.click(lambda x: x, ref_manual_cond, ref_cond) - ref_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) - - ref_flip.select( - flip_hand, - [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond], - [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond] + fix_inpaint_mask.change( + enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_run ) - - # target listeners - target.change(enable_component, [target, target], target_finish_crop) - target_finish_crop.click(prepare_anno, [target], [target_im_raw, target_kp_raw]) - target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_right) - target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_left) - target_kp_raw.change(get_target_anno, [target_im_raw, target_kp_raw], [target_img, target_pose, target_auto_cond, target_auto_keypts]) - target_pose.change(enable_component, [target_kp_raw, target_pose], target_use_auto) - target_pose.change(enable_component, [target_img, target_pose], target_flip) - target_auto_cond.change(lambda x: x, target_auto_cond, target_cond) - target_auto_keypts.change(lambda x: x, target_auto_keypts, target_keypts) - target_use_auto.click(lambda x: x, target_auto_cond, target_cond) - target_use_auto.click(lambda x: x, target_auto_keypts, target_keypts) - target_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Target'", duration=3)) - - target_manual_checkbox.select( + fix_checkbox.select( set_visible, - [target_manual_checkbox, target_kp_got, target_im_raw, target_manual_kp_right, target_manual_kp_left, target_manual_done], + [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left], [ - target_kp_got, - target_manual_kp_right, - target_manual_kp_left, - target_manual_kp_right, - target_manual_undo_right, - target_manual_reset_right, - target_manual_kp_left, - target_manual_undo_left, - target_manual_reset_left, - target_manual_kp_r_info, - target_manual_kp_l_info, - target_manual_done, - 
target_manual_done_info - ] - ) - target_manual_kp_right.select( - get_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_kpts, + fix_kp_right, + fix_kp_left, + fix_kp_right, + fix_undo_right, + fix_reset_right, + fix_kp_left, + fix_undo_left, + fix_reset_left, + fix_kp_r_info, + fix_kp_l_info, + ], ) - target_manual_undo_right.click( - undo_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_kp_right.select( + get_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] # fix_img: (real_cropped_H, real_cropped_W, 3) ) - target_manual_reset_right.click( - reset_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] + fix_undo_right.click( + undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] ) - target_manual_kp_left.select( - get_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_reset_right.click( + reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] ) - target_manual_undo_left.click( - undo_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_kp_left.select( + get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - target_manual_reset_left.click( - reset_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] + fix_undo_left.click( + undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - target_manual_done.click(visible_component, [gr.State(0), target_manual_pose], target_manual_pose) - target_manual_done.click(visible_component, [gr.State(0), target_use_manual], target_use_manual) - target_manual_done.click(get_target_anno, [target_im_raw, target_kp_got], [target_img, target_manual_pose, target_manual_cond, target_manual_keypts]) - target_manual_pose.change(enable_component, [target_manual_pose, target_manual_pose], target_manual_done) - target_manual_pose.change(enable_component, [target_img, target_manual_pose], target_flip) - target_manual_cond.change(lambda x: x, target_manual_cond, target_cond) - target_manual_keypts.change(lambda x: x, target_manual_keypts, target_keypts) - target_use_manual.click(lambda x: x, target_manual_cond, target_cond) - target_use_manual.click(lambda x: x, target_manual_keypts, target_keypts) - target_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) - - target_flip.select( - flip_hand, - [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], - [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], + fix_reset_left.click( + reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] ) - - # run listerners - ref_cond.change(enable_component, [ref_cond, target_cond], run) - target_cond.change(enable_component, [ref_cond, target_cond], run) - # ref_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) - # target_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run) - run.click( - sample_diff, - [ref_cond, target_cond, target_keypts, n_generation, seed, cfg], - 
[results, results_pose], + # fix_vis_mask32.change( + # enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run + # ) + fix_run.click( + ready_sample, + [fix_ref, fix_inpaint_mask, fix_kpts], + [ + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_kpts_np, + fix_vis_mask32, + fix_vis_mask256, + ], ) - clear.click( - clear_all, + fix_kpts_np.change( + sample_inpaint, + [ + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_kpts_np, + fix_original, + fix_crop_coord, + fix_n_generation, + fix_seed, + fix_cfg, + fix_quality, + ], + [fix_result, fix_result_pose, fix_result_original], + ) + fix_clear.click( + fix_clear_all, [], [ - ref, - ref_manual_checkbox, - ref_manual_kp_right, - ref_manual_kp_left, - ref_img, - ref_pose, - ref_manual_pose, - ref_cond, - ref_flip, - target, - target_keypts, - target_manual_checkbox, - target_manual_kp_right, - target_manual_kp_left, - target_img, - target_pose, - target_manual_pose, - target_cond, - target_flip, - results, - results_pose, - n_generation, - seed, - cfg, - ref_kp_raw, + fix_crop, + fix_crop_coord, + fix_ref, + fix_checkbox, + fix_kp_right, + fix_kp_left, + fix_result, + fix_result_pose, + fix_result_original, + fix_inpaint_mask, + fix_original, + fix_img, + fix_vis_mask32, + fix_vis_mask256, + fix_kpts, + fix_kpts_np, + fix_ref_cond, + fix_target_cond, + fix_latent, + fix_inpaint_latent, + fix_n_generation, + fix_seed, + fix_cfg, + fix_quality, ], ) - clear.click( - set_unvisible, + fix_clear.click( + fix_set_unvisible, [], [ - ref_manual_kp_l_info, - ref_manual_kp_r_info, - ref_manual_kp_left, - ref_manual_kp_right, - ref_manual_undo_left, - ref_manual_undo_right, - ref_manual_reset_left, - ref_manual_reset_right, - ref_manual_done, - ref_manual_done_info, - ref_manual_pose, - ref_use_manual, - target_manual_kp_l_info, - target_manual_kp_r_info, - target_manual_kp_left, - target_manual_kp_right, - target_manual_undo_left, - target_manual_undo_right, - target_manual_reset_left, - target_manual_reset_right, - target_manual_done, - target_manual_done_info, - target_manual_pose, - target_use_manual, + fix_kp_right, + fix_kp_left, + fix_kp_r_info, + fix_kp_l_info, + fix_undo_left, + fix_undo_right, + fix_reset_left, + fix_reset_right ] ) - with gr.Tab("Fix Hands"): - fix_inpaint_mask = gr.State(value=None) - fix_original = gr.State(value=None) - fix_crop_coord = gr.State(value=None) - fix_img = gr.State(value=None) - fix_kpts = gr.State(value=None) - fix_kpts_np = gr.State(value=None) - fix_ref_cond = gr.State(value=None) - fix_target_cond = gr.State(value=None) - fix_latent = gr.State(value=None) - fix_inpaint_latent = gr.State(value=None) + with gr.Tab("Demo 2. Repose Hands", elem_id="repose-tab"): + # gr.Markdown("""

Demo 2. Repose Hands

""") + dump = gr.State(value=None) + + # ref states + ref_img = gr.State(value=None) + ref_im_raw = gr.State(value=None) + ref_kp_raw = gr.State(value=0) + ref_is_user = gr.State(value=True) + ref_kp_got = gr.State(value=None) + ref_manual_cond = gr.State(value=None) + ref_auto_cond = gr.State(value=None) + ref_cond = gr.State(value=None) + + # target states + target_img = gr.State(value=None) + target_im_raw = gr.State(value=None) + target_kp_raw = gr.State(value=0) + target_is_user = gr.State(value=True) + target_kp_got = gr.State(value=None) + target_manual_keypts = gr.State(value=None) + target_auto_keypts = gr.State(value=None) + target_keypts = gr.State(value=None) + target_manual_cond = gr.State(value=None) + target_auto_cond = gr.State(value=None) + target_cond = gr.State(value=None) + + # main tab with gr.Row(): - # crop & brush + # ref column with gr.Column(): gr.Markdown( - """

1. Upload a malformed hand image to fix 📥

""" + """

1. Upload a hand image to repose 📥

""" ) gr.Markdown( - """

① Optionally crop the image by clicking top left and bottom right of your desired bounding box around the hand.

""" + """

Optionally crop the image

""" ) - # fix_crop = gr.ImageEditor( - # type="numpy", - # sources=["upload", "webcam", "clipboard"], - # label="Image crop", - # show_label=True, - # height=LENGTH, - # width=LENGTH, - # layers=False, - # # crop_size="1:1", - # transforms=(), - # brush=False, - # image_mode="RGBA", - # container=False, - # ) - fix_crop = gr.Image( + ref = gr.ImageEditor( type="numpy", - sources=["upload", "webcam", "clipboard"], - label="Input Image", + label="Reference", show_label=True, height=LENGTH, width=LENGTH, - interactive=True, - visible=True, - ) - gr.Markdown( - """

💡 If you crop, the model can focus on more details of the cropped area. Square crops might work better than rectangle crops.

""" + brush=False, + layers=False, + crop_size="1:1", ) - # fix_tmp = gr.Image( - # type="numpy", - # label="tmp", - # show_label=True, - # height=LENGTH, - # width=LENGTH, - # interactive=True, - # visible=True, - # sources=[], + gr.Examples(example_ref_imgs, [ref], examples_per_page=20) + # gr.Markdown( + # """

② Hit the "Finish Cropping" button to get hand pose

""" # ) - fix_example = gr.Examples( - fix_example_imgs, - inputs=[fix_crop], - examples_per_page=20, + # ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False) + with gr.Accordion(label="See hand pose and more options", open=False): + with gr.Tab("Automatic hand keypoints"): + ref_pose = gr.Image( + type="numpy", + label="Reference Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + ) + ref_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) + with gr.Tab("Manual hand keypoints"): + ref_manual_checkbox_info = gr.Markdown( + """

Step 1. Tell us if this is right, left, or both hands.

""", + visible=True, + ) + ref_manual_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + visible=True, + interactive=True, + ) + ref_manual_kp_r_info = gr.Markdown( + """

Step 2. Click on the image to provide hand keypoints for the right hand. See \"OpenPose Keypoints Convention\" for guidance.

""", + visible=False, + ) + ref_manual_kp_right = gr.Image( + type="numpy", + label="Keypoint Selection (right hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + ref_manual_undo_right = gr.Button( + value="Undo", interactive=True, visible=False + ) + ref_manual_reset_right = gr.Button( + value="Reset", interactive=True, visible=False + ) + ref_manual_kp_l_info = gr.Markdown( + """

Step 2. Click on the image to provide hand keypoints for the left hand. See \"OpenPose Keypoints Convention\" for guidance.

""", + visible=False + ) + ref_manual_kp_left = gr.Image( + type="numpy", + label="Keypoint Selection (left hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + ref_manual_undo_left = gr.Button( + value="Undo", interactive=True, visible=False + ) + ref_manual_reset_left = gr.Button( + value="Reset", interactive=True, visible=False + ) + ref_manual_done_info = gr.Markdown( + """

Step 3. Hit the \"Done\" button to confirm.

""", + visible=False, + ) + ref_manual_done = gr.Button(value="Done", interactive=True, visible=False) + ref_manual_pose = gr.Image( + type="numpy", + label="Reference Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False + ) + ref_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) + ref_manual_instruct = gr.Markdown( + value="""

OpenPose Keypoints Convention

""", + visible=True + ) + ref_manual_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + visible=True + ) + gr.Markdown( + """

Optionally flip the hand
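Flipping swaps handedness by mirroring the image and keypoints; a simplified sketch (the PR's `flip_hand` additionally updates every linked component), assuming keypoints are stored right hand first as elsewhere in this file:

```python
import numpy as np

def flip_image_and_keypoints(img: np.ndarray, kpts: np.ndarray):
    # img: (H, W, 3); kpts: (42, 2) with the right hand in [:21], left in [21:].
    flipped_img = img[:, ::-1].copy()
    flipped_kpts = kpts.copy()
    flipped_kpts[:, 0] = img.shape[1] - 1 - flipped_kpts[:, 0]            # mirror x
    flipped_kpts = np.concatenate([flipped_kpts[21:], flipped_kpts[:21]])  # swap hands
    return flipped_img, flipped_kpts
```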

""" + ) + ref_flip = gr.Checkbox( + value=False, label="Flip Handedness (Reference)", interactive=False + ) + + # target column + with gr.Column(): + gr.Markdown( + """

2. Upload a hand image for the target hand pose 📥

""" ) gr.Markdown( - """

② Brush area (e.g., wrong finger) that needs to be fixed. Don't brush the entire hand!

""" + """

Optionally crop the image

""" ) - fix_ref = gr.ImageEditor( + target = gr.ImageEditor( type="numpy", - label="Image Brushing", - sources=(), + label="Target", show_label=True, height=LENGTH, width=LENGTH, + brush=False, layers=False, - transforms=("brush"), - brush=gr.Brush( - colors=["rgb(255, 255, 255)"], default_size=20 - ), # 204, 50, 50 - image_mode="RGBA", - container=False, - interactive=False, - ) - gr.Markdown( - """

③ Hit the \"Finish Cropping & Brushing\" button

""" - ) - fix_finish_crop = gr.Button( - value="Finish Croping & Brushing", interactive=False + crop_size="1:1", ) - - # keypoint selection + gr.Examples(example_target_imgs, [target], examples_per_page=20) + # gr.Markdown( + # """

② Hit the "Finish Cropping" button to get hand pose

""" + # ) + # target_finish_crop = gr.Button( + # value="Finish Cropping", interactive=False + # ) + with gr.Accordion(label="See hand pose and more options", open=False): + with gr.Tab("Automatic hand keypoints"): + target_pose = gr.Image( + type="numpy", + label="Target Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + ) + target_use_auto = gr.Button(value="Click here to use automatic, not manual", interactive=False, visible=True) + with gr.Tab("Manual hand keypoints"): + target_manual_checkbox_info = gr.Markdown( + """

Step 1. Tell us if this is right, left, or both hands.

""", + visible=True, + ) + target_manual_checkbox = gr.CheckboxGroup( + ["Right hand", "Left hand"], + show_label=False, + visible=True, + interactive=True, + ) + target_manual_kp_r_info = gr.Markdown( + """

Step 2. Click on the image to provide hand keypoints for the right hand. See \"OpenPose Keypoints Convention\" for guidance.

""", + visible=False, + ) + target_manual_kp_right = gr.Image( + type="numpy", + label="Keypoint Selection (right hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + target_manual_undo_right = gr.Button( + value="Undo", interactive=True, visible=False + ) + target_manual_reset_right = gr.Button( + value="Reset", interactive=True, visible=False + ) + target_manual_kp_l_info = gr.Markdown( + """

Step 2. Click on the image to provide hand keypoints for the left hand. See \"OpenPose Keypoints Convention\" for guidance.

""", + visible=False + ) + target_manual_kp_left = gr.Image( + type="numpy", + label="Keypoint Selection (left hand)", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False, + sources=[], + ) + with gr.Row(): + target_manual_undo_left = gr.Button( + value="Undo", interactive=True, visible=False + ) + target_manual_reset_left = gr.Button( + value="Reset", interactive=True, visible=False + ) + target_manual_done_info = gr.Markdown( + """

Step 3. Hit the \"Done\" button to confirm.

""", + visible=False, + ) + target_manual_done = gr.Button(value="Done", interactive=True, visible=False) + target_manual_pose = gr.Image( + type="numpy", + label="Target Pose", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + visible=False + ) + target_use_manual = gr.Button(value="Click here to use manual, not automatic", interactive=True, visible=False) + target_manual_instruct = gr.Markdown( + value="""

OpenPose Keypoints Convention

""", + visible=True + ) + target_manual_openpose = gr.Image( + value="openpose.png", + type="numpy", + show_label=False, + height=LENGTH // 2, + width=LENGTH // 2, + interactive=False, + visible=True + ) + gr.Markdown( + """

Optionally flip the hand

""" + ) + target_flip = gr.Checkbox( + value=False, label="Flip Handedness (Target)", interactive=False + ) + + # result column with gr.Column(): gr.Markdown( - """

2. Click on hand to get target hand pose

""" + """

3. Press "Run" to get the reposed results 🎯

""" ) + run = gr.Button(value="Run", interactive=False) gr.Markdown( - """

① Tell us if this is right, left, or both hands

""" + """

⚠️ ~20s per generation with RTX3090. ~50s with A100.
(For example, with Number of generations set to 2 on an RTX3090, it would take around 40s.)
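The estimate scales linearly with the slider value; illustrative numbers taken from the warning above:

```python
def estimated_seconds(n_generation: int, per_gen: float = 20.0) -> float:
    # e.g. estimated_seconds(2) == 40.0 on an RTX3090 (~50 s/gen on an A100)
    return n_generation * per_gen
```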

""" ) - fix_checkbox = gr.CheckboxGroup( - ["Right hand", "Left hand"], - show_label=False, - interactive=False, - ) - fix_kp_r_info = gr.Markdown( - """

② Click 21 keypoints on the image to provide the target hand pose of right hand. See the \"OpenPose keypoints convention\" for guidance.

""", - visible=False - ) - # fix_kp_r_info = gr.Markdown( - # """

Select right only

""", - # visible=False, - # ) - fix_kp_right = gr.Image( - type="numpy", - label="Keypoint Selection (right hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - fix_undo_right = gr.Button( - value="Undo", interactive=False, visible=False - ) - fix_reset_right = gr.Button( - value="Reset", interactive=False, visible=False - ) - fix_kp_l_info = gr.Markdown( - """

② Click 21 keypoints on the image to provide the target hand pose of left hand. See the \"OpenPose keypoints convention\" for guidance.

""", - visible=False - ) - fix_kp_left = gr.Image( - type="numpy", - label="Keypoint Selection (left hand)", - show_label=True, - height=LENGTH, - width=LENGTH, - interactive=False, - visible=False, - sources=[], - ) - with gr.Row(): - fix_undo_left = gr.Button( - value="Undo", interactive=False, visible=False - ) - fix_reset_left = gr.Button( - value="Reset", interactive=False, visible=False - ) - gr.Markdown( - """

OpenPose keypoints convention

""" - ) - fix_openpose = gr.Image( - value="openpose.png", - type="numpy", - show_label=False, - height=LENGTH // 2, - width=LENGTH // 2, - interactive=False, - ) - - # get latent - with gr.Column(): - gr.Markdown( - """

3. Press "Ready" to start pre-processing

""" - ) - fix_ready = gr.Button(value="Ready", interactive=False) - gr.Markdown( - """

Visualized (256, 256)-resized, brushed image

""" - ) - fix_vis_mask32 = gr.Image( - type="numpy", - label=f"Visualized {opts.latent_size} Inpaint Mask", - show_label=True, - height=opts.latent_size, - width=opts.latent_size, - interactive=False, - visible=False, - ) - fix_vis_mask256 = gr.Image( - type="numpy", - visible=True, - show_label=False, - height=opts.image_size, - width=opts.image_size, - interactive=False, - ) - # gr.Markdown( - # """

[NOTE] Above should be inpaint mask that you brushed, NOT the segmentation mask of the entire hand.

""" - # ) - - # result column - with gr.Column(): - gr.Markdown( - """

4. Press "Run" to get the fixed hand image 🎯

""" - ) - fix_run = gr.Button(value="Run", interactive=False) - gr.Markdown( - """

⚠️ >3min and ~24GB per generation

""" - ) - fix_result_original = gr.Gallery( - type="numpy", - label="Results on original input", - show_label=True, - height=LENGTH, - min_width=LENGTH, - columns=FIX_MAX_N, - interactive=False, - preview=True, - ) - fix_result = gr.Gallery( + results = gr.Gallery( type="numpy", label="Results", show_label=True, height=LENGTH, min_width=LENGTH, - columns=FIX_MAX_N, - interactive=False, - preview=True, - ) - fix_result_pose = gr.Gallery( - type="numpy", - label="Results Pose", - show_label=True, - height=LENGTH, - min_width=LENGTH, - columns=FIX_MAX_N, + columns=MAX_N, interactive=False, preview=True, ) + with gr.Accordion(label="Results with pose", open=False): + results_pose = gr.Gallery( + type="numpy", + label="Results Pose", + show_label=True, + height=LENGTH, + min_width=LENGTH, + columns=MAX_N, + interactive=False, + preview=True, + ) gr.Markdown( """

✨ Hit "Clear" to restart from the beginning

""" ) - fix_clear = gr.ClearButton() + clear = gr.ClearButton() - with gr.Tab("More options"): - gr.Markdown( - "⚠️ Currently, Number of generation > 1 could lead to out-of-memory" - ) + with gr.Accordion(label="More options", open=False): with gr.Row(): - fix_n_generation = gr.Slider( + n_generation = gr.Slider( label="Number of generations", value=1, minimum=1, - maximum=FIX_MAX_N, + maximum=MAX_N, step=1, randomize=False, interactive=True, ) - fix_seed = gr.Slider( + seed = gr.Slider( label="Seed", value=42, minimum=0, @@ -1992,174 +2003,224 @@ with gr.Blocks(css=custom_css, theme="soft") as demo: randomize=False, interactive=True, ) - fix_cfg = gr.Slider( + cfg = gr.Slider( label="Classifier free guidance scale", - value=3.0, + value=2.5, minimum=0.0, maximum=10.0, step=0.1, randomize=False, interactive=True, ) - fix_quality = gr.Slider( - label="Quality", - value=10, - minimum=1, - maximum=10, - step=1, - randomize=False, - interactive=True, - ) + + # reference listeners + # ref.change(enable_component, [ref, ref], ref_finish_crop) + ref.change(prepare_anno, [ref, ref_is_user], [ref_im_raw, ref_kp_raw]) + ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_right) + ref_kp_raw.change(lambda x: x, ref_im_raw, ref_manual_kp_left) + ref_kp_raw.change(get_ref_anno, [ref_im_raw, ref_kp_raw], [ref_img, ref_pose, ref_auto_cond, ref, ref_is_user]) + ref_pose.change(enable_component, [ref_kp_raw, ref_pose], ref_use_auto) + ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip) + ref_auto_cond.change(lambda x: x, ref_auto_cond, ref_cond) + ref_use_auto.click(lambda x: x, ref_auto_cond, ref_cond) + ref_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Reference'", duration=3)) - # listeners - # fix_crop.change(resize_to_full, fix_crop, fix_ref) - fix_crop.change(lambda x: x, fix_crop, fix_original) # fix_original: (real_H, real_W, 3) - fix_crop.change(stay_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) - fix_crop.select(process_crop, [fix_crop, fix_crop_coord], [fix_crop_coord, fix_ref]) - # fix_ref.change(disable_crop, fix_crop_coord, fix_crop) - fix_ref.change(enable_component, [fix_crop, fix_crop], fix_ref) - fix_ref.change(enable_component, [fix_crop, fix_crop], fix_finish_crop) - fix_finish_crop.click(visualize_ref, [fix_ref], [fix_img]) - fix_finish_crop.click(get_mask_inpaint, [fix_ref], [fix_inpaint_mask]) # fix_ref: (real_cropped_H, real_cropped_W, 3) - fix_img.change(lambda x: x, [fix_img], [fix_kp_right]) - fix_img.change(lambda x: x, [fix_img], [fix_kp_left]) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox + ref_manual_checkbox.select( + set_visible, + [ref_manual_checkbox, ref_kp_got, ref_im_raw, ref_manual_kp_right, ref_manual_kp_left, ref_manual_done], + [ + ref_kp_got, + ref_manual_kp_right, + ref_manual_kp_left, + ref_manual_kp_right, + ref_manual_undo_right, + ref_manual_reset_right, + ref_manual_kp_left, + ref_manual_undo_left, + ref_manual_reset_left, + ref_manual_kp_r_info, + ref_manual_kp_l_info, + ref_manual_done, + ref_manual_done_info + ] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right + ref_manual_kp_right.select( + get_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right + ref_manual_undo_right.click( + undo_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, 
ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right + ref_manual_reset_right.click( + reset_kps, [ref_im_raw, ref_kp_got, gr.State("right")], [ref_manual_kp_right, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left + ref_manual_kp_left.select( + get_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left + ref_manual_undo_left.click( + undo_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left + ref_manual_reset_left.click( + reset_kps, [ref_im_raw, ref_kp_got, gr.State("left")], [ref_manual_kp_left, ref_kp_got] ) - fix_inpaint_mask.change( - enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_ready + ref_manual_done.click(visible_component, [gr.State(0), ref_manual_pose], ref_manual_pose) + ref_manual_done.click(visible_component, [gr.State(0), ref_use_manual], ref_use_manual) + ref_manual_done.click(get_ref_anno, [ref_im_raw, ref_kp_got], [ref_img, ref_manual_pose, ref_manual_cond]) + ref_manual_pose.change(enable_component, [ref_manual_pose, ref_manual_pose], ref_manual_done) + ref_manual_pose.change(enable_component, [ref_img, ref_manual_pose], ref_flip) + ref_manual_cond.change(lambda x: x, ref_manual_cond, ref_cond) + ref_use_manual.click(lambda x: x, ref_manual_cond, ref_cond) + ref_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) + + ref_flip.select( + flip_hand, + [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond], + [ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond] ) - fix_checkbox.select( + + # target listeners + # target.change(enable_component, [target, target], target_finish_crop) + target.change(prepare_anno, [target, target_is_user], [target_im_raw, target_kp_raw]) + target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_right) + target_kp_raw.change(lambda x:x, target_im_raw, target_manual_kp_left) + target_kp_raw.change(get_target_anno, [target_im_raw, target_kp_raw], [target_img, target_pose, target_auto_cond, target_auto_keypts, target, target_is_user]) + target_pose.change(enable_component, [target_kp_raw, target_pose], target_use_auto) + target_pose.change(enable_component, [target_img, target_pose], target_flip) + target_auto_cond.change(lambda x: x, target_auto_cond, target_cond) + target_auto_keypts.change(lambda x: x, target_auto_keypts, target_keypts) + target_use_auto.click(lambda x: x, target_auto_cond, target_cond) + target_use_auto.click(lambda x: x, target_auto_keypts, target_keypts) + target_use_auto.click(lambda x: gr.Info("Automatic hand keypoints will be used for 'Target'", duration=3)) + + target_manual_checkbox.select( set_visible, - [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left], + [target_manual_checkbox, target_kp_got, target_im_raw, target_manual_kp_right, target_manual_kp_left, target_manual_done], [ - fix_kpts, - fix_kp_right, - fix_kp_left, - fix_kp_right, - fix_undo_right, - fix_reset_right, - fix_kp_left, - fix_undo_left, - fix_reset_left, - fix_kp_r_info, - fix_kp_l_info, - ], - ) - fix_kp_right.select( - get_kps, [fix_img, fix_kpts, 
gr.State("right")], [fix_kp_right, fix_kpts] # fix_img: (real_cropped_H, real_cropped_W, 3) + target_kp_got, + target_manual_kp_right, + target_manual_kp_left, + target_manual_kp_right, + target_manual_undo_right, + target_manual_reset_right, + target_manual_kp_left, + target_manual_undo_left, + target_manual_reset_left, + target_manual_kp_r_info, + target_manual_kp_l_info, + target_manual_done, + target_manual_done_info + ] ) - fix_undo_right.click( - undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] + target_manual_kp_right.select( + get_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_reset_right.click( - reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts] + target_manual_undo_right.click( + undo_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_kp_left.select( - get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_reset_right.click( + reset_kps, [target_im_raw, target_kp_got, gr.State("right")], [target_manual_kp_right, target_kp_got] ) - fix_undo_left.click( - undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_kp_left.select( + get_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_reset_left.click( - reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts] + target_manual_undo_left.click( + undo_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_vis_mask32.change( - enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run + target_manual_reset_left.click( + reset_kps, [target_im_raw, target_kp_got, gr.State("left")], [target_manual_kp_left, target_kp_got] ) - fix_ready.click( - ready_sample, - [fix_ref, fix_inpaint_mask, fix_kpts], - [ - fix_ref_cond, - fix_target_cond, - fix_latent, - fix_inpaint_latent, - fix_kpts_np, - fix_vis_mask32, - fix_vis_mask256, - ], + target_manual_done.click(visible_component, [gr.State(0), target_manual_pose], target_manual_pose) + target_manual_done.click(visible_component, [gr.State(0), target_use_manual], target_use_manual) + target_manual_done.click(get_target_anno, [target_im_raw, target_kp_got], [target_img, target_manual_pose, target_manual_cond, target_manual_keypts]) + target_manual_pose.change(enable_component, [target_manual_pose, target_manual_pose], target_manual_done) + target_manual_pose.change(enable_component, [target_img, target_manual_pose], target_flip) + target_manual_cond.change(lambda x: x, target_manual_cond, target_cond) + target_manual_keypts.change(lambda x: x, target_manual_keypts, target_keypts) + target_use_manual.click(lambda x: x, target_manual_cond, target_cond) + target_use_manual.click(lambda x: x, target_manual_keypts, target_keypts) + target_use_manual.click(lambda x: gr.Info("Manual hand keypoints will be used for 'Reference'", duration=3)) + + target_flip.select( + flip_hand, + [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], + [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts], ) - fix_run.click( - sample_inpaint, - [ - fix_ref_cond, - 
+        target_flip.select(
+            flip_hand,
+            [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts],
+            [target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts],
        )
-        fix_run.click(
-            sample_inpaint,
-            [
-                fix_ref_cond,
-                fix_target_cond,
-                fix_latent,
-                fix_inpaint_latent,
-                fix_kpts_np,
-                fix_original,
-                fix_crop_coord,
-                fix_n_generation,
-                fix_seed,
-                fix_cfg,
-                fix_quality,
-            ],
-            [fix_result, fix_result_pose, fix_result_original],
+
+        # run listeners
+        ref_cond.change(enable_component, [ref_cond, target_cond], run)
+        target_cond.change(enable_component, [ref_cond, target_cond], run)
+        # ref_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run)
+        # target_manual_pose.change(enable_component, [ref_manual_pose, target_manual_pose], run)
+        run.click(
+            sample_diff,
+            [ref_cond, target_cond, target_keypts, n_generation, seed, cfg],
+            [results, results_pose],
        )
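+
+        # Clear listeners below: clear_all resets every ref/target component to its initial
+        # value, and set_unvisible re-hides all of the manual-annotation controls.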

-        fix_clear.click(
-            fix_clear_all,
+        clear.click(
+            clear_all,
            [],
            [
-                fix_crop,
-                fix_crop_coord,
-                fix_ref,
-                fix_checkbox,
-                fix_kp_right,
-                fix_kp_left,
-                fix_result,
-                fix_result_pose,
-                fix_result_original,
-                fix_inpaint_mask,
-                fix_original,
-                fix_img,
-                fix_vis_mask32,
-                fix_vis_mask256,
-                fix_kpts,
-                fix_kpts_np,
-                fix_ref_cond,
-                fix_target_cond,
-                fix_latent,
-                fix_inpaint_latent,
-                fix_n_generation,
-                fix_seed,
-                fix_cfg,
-                fix_quality,
+                ref,
+                ref_manual_checkbox,
+                ref_manual_kp_right,
+                ref_manual_kp_left,
+                ref_img,
+                ref_pose,
+                ref_manual_pose,
+                ref_cond,
+                ref_flip,
+                target,
+                target_keypts,
+                target_manual_checkbox,
+                target_manual_kp_right,
+                target_manual_kp_left,
+                target_img,
+                target_pose,
+                target_manual_pose,
+                target_cond,
+                target_flip,
+                results,
+                results_pose,
+                n_generation,
+                seed,
+                cfg,
+                ref_kp_raw,
            ],
        )
-        fix_clear.click(
-            fix_set_unvisible,
+        clear.click(
+            set_unvisible,
            [],
            [
-                fix_kp_right,
-                fix_kp_left,
-                fix_kp_r_info,
-                fix_kp_l_info,
-                fix_undo_left,
-                fix_undo_right,
-                fix_reset_left,
-                fix_reset_right
+                ref_manual_kp_l_info,
+                ref_manual_kp_r_info,
+                ref_manual_kp_left,
+                ref_manual_kp_right,
+                ref_manual_undo_left,
+                ref_manual_undo_right,
+                ref_manual_reset_left,
+                ref_manual_reset_right,
+                ref_manual_done,
+                ref_manual_done_info,
+                ref_manual_pose,
+                ref_use_manual,
+                target_manual_kp_l_info,
+                target_manual_kp_r_info,
+                target_manual_kp_left,
+                target_manual_kp_right,
+                target_manual_undo_left,
+                target_manual_undo_right,
+                target_manual_reset_left,
+                target_manual_reset_right,
+                target_manual_done,
+                target_manual_done_info,
+                target_manual_pose,
+                target_use_manual,
            ]
        )
+    gr.Markdown("<h1>Acknowledgement</h1>")
+    gr.Markdown(_ACK_)
+    gr.Markdown("<h1>Troubleshooting</h1>")
+    gr.Markdown(
+        "If something doesn't work,\n"
+        "1. Try refreshing the page and running the demo again.\n"
+        "2. Leave a message in the \"Community\" tab at the top right of our Hugging Face Space, open an issue on our GitHub repo, or email us.\n"
+        "3. The problem may stem from a compatibility issue with Hugging Face or from GPU VRAM limitations; if possible, we highly recommend cloning this repo and running it on your own GPU."
+    )
    gr.Markdown("<h1>Citation</h1>")
    gr.Markdown(
        """
        If this was useful, please cite us! ❀️
        """