Spaces:

WensongSong
/

Insert-Anything

Running on Zero

App Files Files Community

Insert-Anything / app.py

WensongSong

Update app.py

72509e7 verified 3 days ago

raw

history blame contribute delete

14.3 kB

	import os
	import sys
	import cv2
	import numpy as np
	import torch
	import gradio as gr
	from PIL import Image, ImageFilter, ImageDraw
	from huggingface_hub import snapshot_download
	from diffusers import FluxFillPipeline, FluxPriorReduxPipeline
	import math
	from utils.utils import get_bbox_from_mask, expand_bbox, pad_to_square, box2squre, crop_back, expand_image_mask

	import os
	import sys

	# Install the vendored packages in editable mode and put their folders on
	# sys.path so the GroundingDINO / segment_anything imports below resolve.
	os.system("python -m pip install -e segment_anything")
	os.system("python -m pip install -e GroundingDINO")
	sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
	sys.path.append(os.path.join(os.getcwd(), "segment_anything"))

	# Fetch each checkpoint only when it is not already in the working directory.
	# An unconditional `wget` downloads the weights again on every start and,
	# because the target file already exists, saves the copy under a ".1" suffix
	# that nothing in this file ever reads.
	for _checkpoint_url in (
	    "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth",
	    "https://huggingface.co/spaces/mrtlive/segment-anything-model/resolve/main/sam_vit_h_4b8939.pth",
	):
	    if not os.path.exists(os.path.basename(_checkpoint_url)):
	        os.system("wget " + _checkpoint_url)

	import torchvision
	from GroundingDINO.groundingdino.util.inference import load_model
	from segment_anything import build_sam, SamPredictor
	import spaces
	import GroundingDINO.groundingdino.datasets.transforms as T
	from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap



	# GroundingDINO config and checkpoint
	# The checkpoint file name matches the file fetched with wget earlier in this script.
	GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
	GROUNDING_DINO_CHECKPOINT_PATH = "./groundingdino_swinb_cogcoor.pth"

	# Segment-Anything checkpoint
	# NOTE(review): SAM_ENCODER_VERSION is not referenced anywhere in the visible
	# source; build_sam() below is called with the checkpoint path only.
	SAM_ENCODER_VERSION = "vit_h"
	SAM_CHECKPOINT_PATH = "./sam_vit_h_4b8939.pth"

	# Building GroundingDINO inference model
	# Created once at import time; get_mask() reads it as a module-level global.
	groundingdino_model = load_model(model_config_path=GROUNDING_DINO_CONFIG_PATH, model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH, device="cuda")
	# Building SAM Model and SAM Predictor
	sam = build_sam(checkpoint=SAM_CHECKPOINT_PATH)
	sam.to(device="cuda")
	# Predictor wrapper used by get_mask() (set_image / predict_torch).
	sam_predictor = SamPredictor(sam)

	def transform_image(image_pil):
	    """Turn a PIL image into the normalized tensor passed to GroundingDINO.

	    The image goes through ``T.RandomResize([800], max_size=1333)``, is
	    converted to a tensor and normalized with the mean/std constants below.
	    The transform is called with ``None`` as its second argument and the
	    second element of its result is discarded.
	    """
	    preprocessing_steps = [
	        T.RandomResize([800], max_size=1333),
	        T.ToTensor(),
	        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	    ]
	    pipeline = T.Compose(preprocessing_steps)
	    tensor_image, _unused_target = pipeline(image_pil, None)  # 3, h, w
	    return tensor_image


	def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
	    """Run the grounding model on one image and keep the confident boxes.

	    Args:
	        model: Grounding model. It is called as
	            ``model(image[None], captions=[caption])`` and must expose a
	            ``tokenizer`` attribute.
	        image: Preprocessed image tensor; a batch axis is added here.
	        caption: Text label to ground. It is lower-cased, stripped and
	            terminated with "." before being passed to the model.
	        box_threshold: A query is kept when its largest sigmoid logit is
	            above this value.
	        text_threshold: Per-token threshold used when decoding the phrase.
	        with_logits: When true, the score (first four characters of its
	            string form) is appended in parentheses to each phrase.

	    Returns:
	        ``(boxes, scores, phrases)``: the kept boxes, a tensor holding one
	        score per kept box, and the decoded phrase of each kept box. All
	        three are empty when nothing passes ``box_threshold``.
	    """
	    caption = caption.lower().strip()
	    if not caption.endswith("."):
	        caption = caption + "."

	    # Callers in this file pass the tensor straight from transform_image(),
	    # while the model is loaded with device="cuda". Run the image on whatever
	    # device the model weights live on so both sides agree.
	    parameters = getattr(model, "parameters", None)
	    first_param = next(iter(parameters()), None) if callable(parameters) else None
	    if first_param is not None:
	        image = image.to(first_param.device)

	    with torch.no_grad():
	        outputs = model(image[None], captions=[caption])
	    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
	    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)

	    # filter output; boolean-mask indexing already yields new tensors, so the
	    # model outputs are never modified by later in-place edits of the result.
	    keep = logits.max(dim=1)[0] > box_threshold
	    logits_filt = logits[keep]  # num_filt, 256
	    boxes_filt = boxes[keep]  # num_filt, 4

	    # get phrase
	    tokenizer = model.tokenizer
	    tokenized = tokenizer(caption)

	    # build pred
	    pred_phrases = []
	    scores = []
	    for logit in logits_filt:
	        score = logit.max().item()
	        phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer)
	        if with_logits:
	            phrase = phrase + f"({str(score)[:4]})"
	        pred_phrases.append(phrase)
	        scores.append(score)

	    return boxes_filt, torch.Tensor(scores), pred_phrases


	def get_mask(image, label):
	    """Segment the object named by ``label`` and return its mask as a PIL image.

	    The module-level GroundingDINO model proposes boxes for the label, the
	    boxes are pruned with NMS (IoU threshold 0.8), and the module-level SAM
	    predictor produces one mask per remaining box. The mask of the first
	    remaining box is returned.

	    Args:
	        image: PIL image to search.
	        label: Text label describing the object.

	    Returns:
	        A PIL image built from the predicted mask array.

	    Raises:
	        gr.Error: If no box is detected for ``label``. Without this check the
	            function would fail further down with an opaque indexing error.
	    """
	    image_pil = image.convert("RGB")
	    transformed_image = transform_image(image_pil)

	    boxes_filt, scores, _pred_phrases = get_grounding_output(
	        groundingdino_model, transformed_image, label
	    )

	    if boxes_filt.size(0) == 0:
	        raise gr.Error('No object matching the label was found in the reference image.Please try another label!')

	    # process boxes: they are treated as normalized (cx, cy, w, h) and turned
	    # into pixel (x1, y1, x2, y2) corners for all boxes at once.
	    W, H = image_pil.size
	    boxes_filt = boxes_filt.cpu() * torch.Tensor([W, H, W, H])
	    boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
	    boxes_filt[:, 2:] += boxes_filt[:, :2]

	    # nms
	    nms_idx = torchvision.ops.nms(boxes_filt, scores, 0.8).numpy().tolist()
	    boxes_filt = boxes_filt[nms_idx]

	    image = np.array(image_pil)
	    sam_predictor.set_image(image)

	    transformed_boxes = sam_predictor.transform.apply_boxes_torch(
	        boxes_filt, image.shape[:2]).to("cuda")

	    masks, _, _ = sam_predictor.predict_torch(
	        point_coords=None,
	        point_labels=None,
	        boxes=transformed_boxes,
	        multimask_output=False,
	    )
	    # Only the mask of the first box that survived NMS is used.
	    result_mask = masks[0][0].cpu().numpy()

	    return Image.fromarray(result_mask)


	hf_token = os.environ.get("HF_TOKEN")

	# Mirror the three model repositories into local folders, in this order.
	for _repo_id, _local_dir in (
	    ("black-forest-labs/FLUX.1-Fill-dev", "./FLUX.1-Fill-dev"),
	    ("black-forest-labs/FLUX.1-Redux-dev", "./FLUX.1-Redux-dev"),
	    ("WensongSong/Insert-Anything", "./insertanything_model"),
	):
	    snapshot_download(repo_id=_repo_id, local_dir=_local_dir, token=hf_token)

	# Shared settings: weight dtype and the working resolution read by run_local().
	dtype = torch.bfloat16
	size = (768, 768)

	# Fill pipeline with the Insert-Anything LoRA weights loaded on top.
	pipe = FluxFillPipeline.from_pretrained("./FLUX.1-Fill-dev", torch_dtype=dtype)
	pipe = pipe.to("cuda")
	pipe.load_lora_weights(
	    "./insertanything_model/20250321_steps5000_pytorch_lora_weights.safetensors"
	)

	# Redux prior pipeline; run_local() calls it on the masked reference image.
	redux = FluxPriorReduxPipeline.from_pretrained("./FLUX.1-Redux-dev")
	redux = redux.to(dtype=dtype).to("cuda")



	### example #####
	def list_example_images(directory):
	    """Return the sorted paths of the .jpg/.jpeg/.png files in ``directory``.

	    The check is on the file extension (case-insensitive) rather than a
	    substring match, so a name such as ``notes.jpg.txt`` is not picked up.
	    A missing directory yields an empty list so that absent example assets
	    do not prevent the app from starting.
	    """
	    if not os.path.isdir(directory):
	        return []
	    return sorted(
	        os.path.join(directory, name)
	        for name in os.listdir(directory)
	        if name.lower().endswith((".jpg", ".png", ".jpeg"))
	    )


	ref_dir = './examples/ref_image'
	ref_mask_dir = './examples/ref_mask'
	image_dir = './examples/source_image'
	image_mask_dir = './examples/source_mask'

	# The four lists are consumed index-by-index by the examples section of the
	# UI, so entries at the same position are expected to belong together.
	ref_list = list_example_images(ref_dir)
	ref_mask_list = list_example_images(ref_mask_dir)
	image_list = list_example_images(image_dir)
	image_mask_list = list_example_images(image_mask_dir)
	### example #####



	def _editor_background(editor_value, what):
	    """Return the uploaded image of an ImageEditor value or raise gr.Error."""
	    picture = editor_value.get("background") if editor_value else None
	    if picture is None:
	        raise gr.Error(f'No {what} uploaded.Please upload one first!')
	    return picture


	def _editor_drawn_layer(editor_value, what):
	    """Return the first drawn layer of an ImageEditor value or raise gr.Error."""
	    layers = editor_value.get("layers") if editor_value else None
	    if not layers:
	        raise gr.Error(f'No mask drawn on the {what}.Please check mask button!')
	    return layers[0]


	@spaces.GPU
	def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt):
	    """Insert the masked reference object into the masked region of the background.

	    Args:
	        base_image: ImageEditor value of the background image
	            (``"background"`` / ``"layers"`` entries are read).
	        base_mask: ImageEditor value of the uploaded background mask.
	        reference_image: ImageEditor value of the reference image.
	        ref_mask: ImageEditor value of the uploaded reference mask.
	        seed: Seed for the CUDA random generator.
	        base_mask_option: "Draw Mask" takes the mask from the first drawn layer
	            of ``base_image``; any other value takes it from ``base_mask``.
	        ref_mask_option: "Draw Mask", "Upload with Mask", or anything else to
	            derive the mask from ``text_prompt`` via get_mask().
	        text_prompt: Label used when the reference mask is generated.

	    Returns:
	        ``[edited_image]``, or ``[reference_mask, edited_image]`` when
	        ``ref_mask_option`` is "Label to Mask".

	    Raises:
	        gr.Error: If an image or mask is missing, a mask is empty after
	            thresholding, a mask does not match its image size, or the label
	            is empty when a label-generated mask is requested.
	    """
	    # Pick the target (background) image and its mask.
	    tar_image = _editor_background(base_image, "background image")
	    if base_mask_option == "Draw Mask":
	        tar_mask = _editor_drawn_layer(base_image, "background image")
	    else:
	        tar_mask = _editor_background(base_mask, "background mask")

	    # Pick the reference image and its mask.
	    ref_image = _editor_background(reference_image, "reference image")
	    if ref_mask_option == "Draw Mask":
	        ref_mask = _editor_drawn_layer(reference_image, "reference image")
	    elif ref_mask_option == "Upload with Mask":
	        ref_mask = _editor_background(ref_mask, "reference mask")
	    else:
	        if not (text_prompt or "").strip():
	            raise gr.Error('No label for the reference image.Please enter a label!')
	        ref_mask = get_mask(ref_image, text_prompt)

	    # Masks are applied to the images pixel-by-pixel below, so a size mismatch
	    # would either fail in NumPy or silently crop the wrong region.
	    if tar_mask.size != tar_image.size:
	        raise gr.Error('The background mask must have the same size as the background image.')
	    if ref_mask.size != ref_image.size:
	        raise gr.Error('The reference mask must have the same size as the reference image.')

	    tar_image = tar_image.convert("RGB")
	    tar_mask = tar_mask.convert("L")
	    ref_image = ref_image.convert("RGB")
	    ref_mask = ref_mask.convert("L")

	    # Kept so the generated reference mask can be shown next to the result.
	    return_ref_mask = ref_mask.copy()

	    # Binarize both masks: values above 128 become 1, everything else 0.
	    tar_image = np.asarray(tar_image)
	    tar_mask = np.asarray(tar_mask)
	    tar_mask = np.where(tar_mask > 128, 1, 0).astype(np.uint8)

	    ref_image = np.asarray(ref_image)
	    ref_mask = np.asarray(ref_mask)
	    ref_mask = np.where(ref_mask > 128, 1, 0).astype(np.uint8)

	    if tar_mask.sum() == 0:
	        raise gr.Error('No mask for the background image.Please check mask button!')

	    if ref_mask.sum() == 0:
	        raise gr.Error('No mask for the reference image.Please check mask button!')

	    # Put the reference object on a white background and crop to its bbox.
	    ref_box_yyxx = get_bbox_from_mask(ref_mask)
	    ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1)
	    masked_ref_image = ref_image * ref_mask_3 + np.ones_like(ref_image) * 255 * (1 - ref_mask_3)
	    y1, y2, x1, x2 = ref_box_yyxx
	    masked_ref_image = masked_ref_image[y1:y2, x1:x2, :]
	    ref_mask = ref_mask[y1:y2, x1:x2]
	    ratio = 1.3
	    masked_ref_image, ref_mask = expand_image_mask(masked_ref_image, ref_mask, ratio=ratio)

	    masked_ref_image = pad_to_square(masked_ref_image, pad_value=255, random=False)

	    # Grow the target mask slightly before computing the crop.
	    kernel = np.ones((7, 7), np.uint8)
	    iterations = 2
	    tar_mask = cv2.dilate(tar_mask, kernel, iterations=iterations)

	    # zoom in
	    tar_box_yyxx = get_bbox_from_mask(tar_mask)
	    tar_box_yyxx = expand_bbox(tar_mask, tar_box_yyxx, ratio=1.2)

	    tar_box_yyxx_crop = expand_bbox(tar_image, tar_box_yyxx, ratio=2)  #1.2 1.6
	    tar_box_yyxx_crop = box2squre(tar_image, tar_box_yyxx_crop)  # crop box
	    y1, y2, x1, x2 = tar_box_yyxx_crop

	    # The untouched image is needed at the end to paste the result back.
	    old_tar_image = tar_image.copy()
	    tar_image = tar_image[y1:y2, x1:x2, :]
	    tar_mask = tar_mask[y1:y2, x1:x2]

	    H1, W1 = tar_image.shape[0], tar_image.shape[1]
	    # zoom in

	    tar_mask = pad_to_square(tar_mask, pad_value=0)
	    tar_mask = cv2.resize(tar_mask, size)

	    masked_ref_image = cv2.resize(masked_ref_image.astype(np.uint8), size).astype(np.uint8)
	    pipe_prior_output = redux(Image.fromarray(masked_ref_image))

	    tar_image = pad_to_square(tar_image, pad_value=255)

	    H2, W2 = tar_image.shape[0], tar_image.shape[1]

	    # Diptych: reference on the left, target crop on the right. Only the right
	    # half carries a mask, so only the target side is repainted.
	    tar_image = cv2.resize(tar_image, size)
	    diptych_ref_tar = np.concatenate([masked_ref_image, tar_image], axis=1)

	    tar_mask = np.stack([tar_mask, tar_mask, tar_mask], -1)
	    mask_black = np.zeros_like(tar_image)
	    mask_diptych = np.concatenate([mask_black, tar_mask], axis=1)

	    diptych_ref_tar = Image.fromarray(diptych_ref_tar)
	    mask_diptych[mask_diptych == 1] = 255
	    mask_diptych = Image.fromarray(mask_diptych)

	    # The slider may deliver the seed as a non-int number; manual_seed needs an int.
	    generator = torch.Generator("cuda").manual_seed(int(seed))
	    edited_image = pipe(
	        image=diptych_ref_tar,
	        mask_image=mask_diptych,
	        height=mask_diptych.size[1],
	        width=mask_diptych.size[0],
	        max_sequence_length=512,
	        generator=generator,
	        **pipe_prior_output,
	    ).images[0]

	    # Keep only the right (target) half of the generated diptych.
	    width, height = edited_image.size
	    edited_image = edited_image.crop((width // 2, 0, width, height))

	    edited_image = np.array(edited_image)
	    edited_image = crop_back(edited_image, old_tar_image, np.array([H1, W1, H2, W2]), np.array(tar_box_yyxx_crop))
	    edited_image = Image.fromarray(edited_image)

	    if ref_mask_option != "Label to Mask":
	        return [edited_image]
	    return [return_ref_mask, edited_image]

	def update_ui(option):
	    """Return two visibility updates driven by the selected mask option.

	    For "Draw Mask" the first update hides its component and the second shows
	    its component; for any other option the two are swapped.
	    """
	    draw_selected = option == "Draw Mask"
	    first_update = gr.update(visible=not draw_selected)
	    second_update = gr.update(visible=draw_selected)
	    return first_update, second_update


	# NOTE(review): the nesting of the layout below was reconstructed from a
	# whitespace-stripped copy of this file; confirm it against the intended UI.
	with gr.Blocks() as demo:

	    gr.Markdown("# Insert-Anything")
	    gr.Markdown("### Make sure to select the correct mask button!!")

	    with gr.Row():
	        # Left column: inputs (images, masks, mask options, label).
	        with gr.Column(scale=1):
	            with gr.Row():
	                base_image = gr.ImageEditor(label="Background Image", sources="upload", type="pil", brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
	                                            layers=False,
	                                            interactive=True)

	                base_mask = gr.ImageEditor(label="Background Mask", sources="upload", type="pil", layers=False, brush=False, eraser=False)

	            with gr.Row():
	                base_mask_option = gr.Radio(["Draw Mask", "Upload with Mask"], label="Background Mask Input Option", value="Upload with Mask")

	            with gr.Row():
	                ref_image = gr.ImageEditor(label="Reference Image", sources="upload", type="pil", brush=gr.Brush(colors=["#FFFFFF"], default_size=30, color_mode="fixed"),
	                                           layers=False,
	                                           interactive=True)

	                ref_mask = gr.ImageEditor(label="Reference Mask", sources="upload", type="pil", layers=False, brush=False, eraser=False)

	            with gr.Row():
	                ref_mask_option = gr.Radio(["Draw Mask", "Upload with Mask", "Label to Mask"], label="Reference Mask Input Option", value="Upload with Mask")

	            with gr.Row():
	                text_prompt = gr.Textbox(label="Label")

	        # Right column: output gallery and advanced options.
	        with gr.Column(scale=1):
	            baseline_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", height=765, columns=1)
	            with gr.Accordion("Advanced Option", open=True):
	                seed = gr.Slider(label="Seed", minimum=-1, maximum=999999999, step=1, value=666)
	                gr.Markdown("### Guidelines")
	                gr.Markdown(" Users can try using different seeds. For example, seeds like 42 and 123456 may produce different effects.")
	                gr.Markdown(" Label to Mask means generating a mask by simply inputting a label.")

	    run_local_button = gr.Button(value="Run")

	    # #### example #####
	    # One row per example set. zip() stops at the shortest list, so a missing
	    # file in one example folder drops the incomplete sets instead of raising
	    # IndexError while the page is being built.
	    example_targets = (
	        (base_image, "Examples - Background Image"),
	        (base_mask, "Examples - Background Mask"),
	        (ref_image, "Examples - Reference Object"),
	        (ref_mask, "Examples - Reference Mask"),
	    )
	    example_sets = list(zip(image_list, image_mask_list, ref_list, ref_mask_list))
	    num_examples = len(example_sets)
	    for i, example_paths in enumerate(example_sets):
	        with gr.Row():
	            for example_path, (component, title) in zip(example_paths, example_targets):
	                # Only the first row carries the column titles.
	                gr.Examples([example_path], inputs=[component], label=title if i == 0 else "", examples_per_page=1)
	        if i < num_examples - 1:
	            gr.HTML("<hr>")
	    # #### example #####

	    run_local_button.click(fn=run_local,
	                           inputs=[base_image, base_mask, ref_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt],
	                           outputs=[baseline_gallery]
	                           )
	demo.launch()