Spaces: Running on Zero

Update app.py
app.py CHANGED
@@ -9,7 +9,139 @@ from huggingface_hub import snapshot_download
 from diffusers import FluxFillPipeline, FluxPriorReduxPipeline
 import math
 from utils.utils import get_bbox_from_mask, expand_bbox, pad_to_square, box2squre, crop_back, expand_image_mask
+
+import os, sys
+os.system("python -m pip install -e segment_anything")
+os.system("python -m pip install -e GroundingDINO")
+sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
+sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
+os.system("wget https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth")
+os.system("wget https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth")
+
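Note: the os.system wget calls above re-download both checkpoints on every Space restart, and a failed download is silently ignored. A hedged alternative sketch using huggingface_hub (already imported in app.py), which caches the files and returns local paths; the repo_id/filename pairs are read off the URLs above:

    from huggingface_hub import hf_hub_download
    dino_ckpt = hf_hub_download(repo_id="ShilongLiu/GroundingDINO", filename="groundingdino_swinb_cogcoor.pth")
    sam_ckpt = hf_hub_download(repo_id="mrtlive/segment-anything-model", repo_type="space", filename="sam_vit_h_4b8939.pth")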
+import torchvision
+from GroundingDINO.groundingdino.util.inference import load_model
+from segment_anything import build_sam, SamPredictor
 import spaces
+import GroundingDINO.groundingdino.datasets.transforms as T
+from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
+
+
+# GroundingDINO config and checkpoint
+GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
+GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swinb_cogcoor.pth"
+
+# Segment-Anything checkpoint
+SAM_ENCODER_VERSION = "vit_h"
+SAM_CHECKPOINT_PATH = "./sam_vit_h_4b8939.pth"
+
+# Building GroundingDINO inference model
+groundingdino_model = load_model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH, device="cpu")
+# Building SAM Model and SAM Predictor
+sam = build_sam(checkpoint=SAM_CHECKPOINT_PATH)
+sam_predictor = SamPredictor(sam)
+
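Note: both models are built on CPU on purpose. On a ZeroGPU Space ("Running on Zero" above), CUDA is only attached while a function decorated with @spaces.GPU is executing, so module-level initialization must stay CPU-only. A minimal sketch of the pattern, with a hypothetical helper that is not part of this commit:

    @spaces.GPU
    def gpu_smoke_test():
        # inside the decorated call a GPU is available; device moves are safe here
        sam.to("cuda")
        return torch.cuda.is_available()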
+def transform_image(image_pil):
+
+    transform = T.Compose(
+        [
+            T.RandomResize([800], max_size=1333),
+            T.ToTensor(),
+            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+        ]
+    )
+    image, _ = transform(image_pil, None)  # 3, h, w
+    return image
+
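transform_image reproduces GroundingDINO's evaluation preprocessing: resize the short side to 800 (long side capped at 1333), convert to a tensor, and normalize with ImageNet statistics. A usage sketch with a hypothetical file name:

    from PIL import Image
    pil = Image.open("reference.jpg").convert("RGB")  # hypothetical input
    tensor = transform_image(pil)                     # torch.Tensor of shape (3, H, W)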
+
+def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
+    caption = caption.lower()
+    caption = caption.strip()
+    if not caption.endswith("."):
+        caption = caption + "."
+
+    with torch.no_grad():
+        outputs = model(image[None], captions=[caption])
+    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
+    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+
+    # filter output
+    logits_filt = logits.clone()
+    boxes_filt = boxes.clone()
+    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
+    logits_filt = logits_filt[filt_mask]  # num_filt, 256
+    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
+
+    # get phrase
+    tokenlizer = model.tokenizer
+    tokenized = tokenlizer(caption)
+    # build pred
+    pred_phrases = []
+    scores = []
+    for logit, box in zip(logits_filt, boxes_filt):
+        pred_phrase = get_phrases_from_posmap(
+            logit > text_threshold, tokenized, tokenlizer)
+        if with_logits:
+            pred_phrases.append(
+                pred_phrase + f"({str(logit.max().item())[:4]})")
+        else:
+            pred_phrases.append(pred_phrase)
+        scores.append(logit.max().item())
+
+    return boxes_filt, torch.Tensor(scores), pred_phrases
+
+
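The boxes returned here are still in GroundingDINO's normalized (cx, cy, w, h) format; get_mask below rescales them to pixel (x1, y1, x2, y2) before NMS and SAM. A call sketch, with a hypothetical label:

    boxes, scores, phrases = get_grounding_output(groundingdino_model, tensor, "backpack")
    # with with_logits=True each phrase carries its confidence, e.g. "backpack(0.52)"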
+def get_mask(image, label):
+    global groundingdino_model, sam_predictor
+
+    image_pil = image.convert("RGB")
+    transformed_image = transform_image(image_pil)
+
+    boxes_filt, scores, pred_phrases = get_grounding_output(
+        groundingdino_model, transformed_image, label
+    )
+
+    size = image_pil.size
+
+    # process boxes
+    H, W = size[1], size[0]
+    for i in range(boxes_filt.size(0)):
+        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
+        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
+        boxes_filt[i][2:] += boxes_filt[i][:2]
+
+    boxes_filt = boxes_filt.cpu()
+
+    # nms
+    nms_idx = torchvision.ops.nms(
+        boxes_filt, scores, 0.8).numpy().tolist()
+    boxes_filt = boxes_filt[nms_idx]
+    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
+
+    image = np.array(image_pil)
+    sam_predictor.set_image(image)
+
+    transformed_boxes = sam_predictor.transform.apply_boxes_torch(
+        boxes_filt, image.shape[:2])
+
+    masks, _, _ = sam_predictor.predict_torch(
+        point_coords=None,
+        point_labels=None,
+        boxes=transformed_boxes,
+        multimask_output=False,
+    )
+    result_mask = masks[0][0].cpu().numpy()
+
+    result_mask = Image.fromarray(result_mask)
+
+    return result_mask
+
 
 hf_token = os.getenv("HF_TOKEN")
 
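Note: get_mask keeps only masks[0][0]. torchvision.ops.nms returns indices sorted by descending score, so this is the SAM mask for the highest-scoring surviving box; any other detections for the label are discarded. A usage sketch with hypothetical inputs:

    ref = Image.open("ref.jpg")        # hypothetical image
    mask = get_mask(ref, "backpack")   # binary PIL image; run_local later applies .convert("L")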
@@ -59,26 +191,31 @@ image_mask_list.sort()
 @spaces.GPU
 def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_option, ref_mask_option):
 
+
     if base_mask_option == "Draw Mask":
+        tar_image = base_image["background"]
+        tar_mask = base_image["layers"][0]
     else:
+        tar_image = base_image["background"]
+        tar_mask = base_mask["background"]
 
     if ref_mask_option == "Draw Mask":
+        ref_image = reference_image["background"]
+        ref_mask = reference_image["layers"][0]
+    elif ref_mask_option == "Upload with Mask":
+        ref_image = reference_image["background"]
+        ref_mask = ref_mask["background"]
     else:
+        ref_image = reference_image["background"]
+        ref_mask = get_mask(ref_image, text_prompt)
 
     tar_image = tar_image.convert("RGB")
     tar_mask = tar_mask.convert("L")
     ref_image = ref_image.convert("RGB")
     ref_mask = ref_mask.convert("L")
 
+    return_ref_mask = ref_mask.copy()
+
     tar_image = np.asarray(tar_image)
     tar_mask = np.asarray(tar_mask)
     tar_mask = np.where(tar_mask > 128, 1, 0).astype(np.uint8)
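Note: the "Label to Mask" branch reads text_prompt, but the unchanged def line above still lists seven parameters, while the click wiring at the bottom of this diff passes eight inputs (text_prompt last). As committed, Gradio would call run_local with an extra positional argument and raise a TypeError, and the bare name text_prompt inside the function would otherwise resolve to the global Textbox component rather than its value. The likely intended signature:

    def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt):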
@@ -87,15 +224,20 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     ref_mask = np.asarray(ref_mask)
     ref_mask = np.where(ref_mask > 128, 1, 0).astype(np.uint8)
 
+    if tar_mask.sum() == 0:
+        raise gr.Error('No mask for the background image. Please check the mask button!')
+
+    if ref_mask.sum() == 0:
+        raise gr.Error('No mask for the reference image. Please check the mask button!')
 
     ref_box_yyxx = get_bbox_from_mask(ref_mask)
     ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1)
     masked_ref_image = ref_image * ref_mask_3 + np.ones_like(ref_image) * 255 * (1 - ref_mask_3)
     y1, y2, x1, x2 = ref_box_yyxx
+    masked_ref_image = masked_ref_image[y1:y2, x1:x2, :]
     ref_mask = ref_mask[y1:y2, x1:x2]
     ratio = 1.3
+    masked_ref_image, ref_mask = expand_image_mask(masked_ref_image, ref_mask, ratio=ratio)
 
 
     masked_ref_image = pad_to_square(masked_ref_image, pad_value=255, random=False)
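The yyxx suffix marks the row-first bounding-box convention used by utils.utils: (y1, y2, x1, x2), sliced as [y1:y2, x1:x2]. A toy check of the convention, with assumed end-exclusive bounds:

    m = np.zeros((100, 100), np.uint8)
    m[10:20, 30:40] = 1                # foreground block
    y1, y2, x1, x2 = 10, 20, 30, 40    # what get_bbox_from_mask would return here (assumed)
    assert m[y1:y2, x1:x2].all()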
@@ -172,8 +314,10 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     edited_image = crop_back(edited_image, old_tar_image, np.array([H1, W1, H2, W2]), np.array(tar_box_yyxx_crop))
     edited_image = Image.fromarray(edited_image)
 
+    if ref_mask_option != "Label to Mask":
+        return [edited_image]
+    else:
+        return [return_ref_mask, edited_image]
 
 def update_ui(option):
     if option == "Draw Mask":
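run_local returns a list because its only output component is a gr.Gallery: normally just the edited image, and under "Label to Mask" the auto-generated reference mask (return_ref_mask, saved before binarization) followed by the edited image, so the user can inspect what GroundingDINO + SAM segmented.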
@@ -185,32 +329,37 @@ def update_ui(option):
 with gr.Blocks() as demo:
 
 
+    gr.Markdown("# Insert-Anything")
+    gr.Markdown("### Draw a mask or upload a mask. Only select one of these two methods, and don't forget to click the corresponding button!")
+
 
     with gr.Row():
+        with gr.Column(scale=1):
             with gr.Row():
+                base_image = gr.ImageEditor(label="Background Image", sources="upload", type="pil",
+                                            brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
+                                            layers=False,
+                                            interactive=True)
 
+                base_mask = gr.ImageEditor(label="Background Mask", sources="upload", type="pil", layers=False, brush=False, eraser=False)
 
             with gr.Row():
                 base_mask_option = gr.Radio(["Draw Mask", "Upload with Mask"], label="Background Mask Input Option", value="Upload with Mask")
 
             with gr.Row():
+                ref_image = gr.ImageEditor(label="Reference Image", sources="upload", type="pil",
+                                           brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
+                                           layers=False,
+                                           interactive=True)
 
+                ref_mask = gr.ImageEditor(label="Reference Mask", sources="upload", type="pil", layers=False, brush=False, eraser=False)
 
             with gr.Row():
+                ref_mask_option = gr.Radio(["Draw Mask", "Upload with Mask", "Label to Mask"], label="Reference Mask Input Option", value="Upload with Mask")
 
+            with gr.Row():
+                text_prompt = gr.Textbox(label="Label")
+
+        with gr.Column(scale=1):
+            baseline_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", height=701, columns=1)
     with gr.Accordion("Advanced Option", open=True):
         seed = gr.Slider(label="Seed", minimum=-1, maximum=999999999, step=1, value=666)
     gr.Markdown("### Guidelines")
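update_ui is defined earlier, but its event wiring falls outside this diff's hunks; presumably the two radios toggle the relevant editors on change, along the lines of this hypothetical sketch:

    base_mask_option.change(fn=update_ui, inputs=base_mask_option, outputs=base_mask)
    ref_mask_option.change(fn=update_ui, inputs=ref_mask_option, outputs=ref_mask)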
@@ -218,7 +367,6 @@ with gr.Blocks() as demo:
 
     run_local_button = gr.Button(value="Run")
 
     # #### example #####
     num_examples = len(image_list)
     for i in range(num_examples):
@@ -234,12 +382,11 @@ with gr.Blocks() as demo:
         gr.Examples([ref_list[i]], inputs=[ref_image], examples_per_page=1, label="")
         gr.Examples([ref_mask_list[i]], inputs=[ref_mask], examples_per_page=1, label="")
         if i < num_examples - 1:
+            gr.HTML("<hr>")
     # #### example #####
+
+    run_local_button.click(fn=run_local,
+                           inputs=[base_image, base_mask, ref_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt],
+                           outputs=[baseline_gallery]
+                           )
 demo.launch()