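# Hugging Face Space (running on ZeroGPU): text-prompted segmentation.
# GroundingDINO detects boxes for the given label; SAM turns those boxes into masks.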
import os
import sys
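
# Install the local GroundingDINO and segment_anything checkouts and put them on the import path.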
os.system("python -m pip install -e segment_anything")
os.system("python -m pip install -e GroundingDINO")
sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
sys.path.append(os.path.join(os.getcwd(), "segment_anything"))
os.system("wget https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth")
os.system("wget https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth")
import cv2
import numpy as np
import torch
import torchvision
import gradio as gr
from PIL import Image
from GroundingDINO.groundingdino.util.inference import load_model
from segment_anything import build_sam, SamPredictor
import spaces
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
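
# Select CUDA when available; both models are built once at startup and reused for every request.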
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# GroundingDINO config and checkpoint
GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swinb_cogcoor.pth"
# Segment-Anything checkpoint
SAM_ENCODER_VERSION = "vit_h"
SAM_CHECKPOINT_PATH = "./sam_vit_h_4b8939.pth"
# Building GroundingDINO inference model
groundingdino_model = load_model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH, device=DEVICE)
# Building SAM Model and SAM Predictor
sam = build_sam(checkpoint=SAM_CHECKPOINT_PATH)
sam.to(device=DEVICE)
sam_predictor = SamPredictor(sam)
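
# Convert a PIL image to the normalized tensor GroundingDINO expects
# (shorter side resized to 800 px, capped at 1333, ImageNet mean/std normalization).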
def transform_image(image_pil):
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image, _ = transform(image_pil, None)  # 3, h, w
    return image
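
# Run GroundingDINO on a preprocessed image and a text caption, keeping only boxes
# whose best token score exceeds box_threshold; phrases above text_threshold are
# decoded back from the token position map.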
def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
    # filter output: keep queries whose best token score exceeds box_threshold
    logits_filt = logits.clone()
    boxes_filt = boxes.clone()
    filt_mask = logits_filt.max(dim=1)[0] > box_threshold
    logits_filt = logits_filt[filt_mask]  # num_filt, 256
    boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
    # get phrase
    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)
    # build pred
    pred_phrases = []
    scores = []
    for logit, box in zip(logits_filt, boxes_filt):
        pred_phrase = get_phrases_from_posmap(
            logit > text_threshold, tokenized, tokenizer)
        if with_logits:
            pred_phrases.append(
                pred_phrase + f"({str(logit.max().item())[:4]})")
        else:
            pred_phrases.append(pred_phrase)
        scores.append(logit.max().item())
    return boxes_filt, torch.Tensor(scores), pred_phrases
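
# Full pipeline for one request: GroundingDINO boxes -> NMS -> SAM masks.
# @spaces.GPU asks ZeroGPU to attach a GPU for the duration of the call.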
@spaces.GPU
def run_local(image, label):
    global groundingdino_model, sam_predictor
    image_pil = image.convert("RGB")
    transformed_image = transform_image(image_pil)
    boxes_filt, scores, pred_phrases = get_grounding_output(
        groundingdino_model, transformed_image, label
    )
    size = image_pil.size
    # convert boxes from normalized (cx, cy, w, h) to absolute (x1, y1, x2, y2)
    H, W = size[1], size[0]
    for i in range(boxes_filt.size(0)):
        boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
        boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
        boxes_filt[i][2:] += boxes_filt[i][:2]
    boxes_filt = boxes_filt.cpu()
    # non-maximum suppression to drop heavily overlapping detections
    nms_idx = torchvision.ops.nms(
        boxes_filt, scores, 0.8).numpy().tolist()
    boxes_filt = boxes_filt[nms_idx]
    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
    # prompt SAM with the surviving boxes
    image = np.array(image_pil)
    sam_predictor.set_image(image)
    transformed_boxes = sam_predictor.transform.apply_boxes_torch(
        boxes_filt, image.shape[:2]).to(DEVICE)
    masks, _, _ = sam_predictor.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    # return only the mask for the first (highest-scoring) box as a PIL image
    result_mask = masks[0][0].cpu().numpy()
    result_mask = Image.fromarray(result_mask)
    return [result_mask]
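
# UI: uploaded image and text label on the left, resulting mask gallery on the right.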
with gr.Blocks() as demo:
    gr.Markdown("# Segment")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(sources='upload', type="pil", height=512)
            text_prompt = gr.Textbox(label="Label")
        with gr.Column():
            gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", height=512)
    run_local_button = gr.Button(value="Run")
    run_local_button.click(fn=run_local,
                           inputs=[input_image, text_prompt],
                           outputs=[gallery])

demo.launch()