Spaces:

sengourav012
/

virtual-try-on

Running

App Files Files Community

sengourav012 committed 28 days ago

Commit

3e7b4c7

verified ·

1 Parent(s): ab2903b

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -24

app.py CHANGED Viewed

@@ -5,12 +5,13 @@ import torch.nn as nn
 import numpy as np
 from torchvision import transforms
 import cv2
 from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# ----------------- Load Human Parser Model from Hugging Face Hub -----------------
 processor = AutoImageProcessor.from_pretrained("matei-dorian/segformer-b5-finetuned-human-parsing")
 parser_model = SegformerForSemanticSegmentation.from_pretrained(
     "matei-dorian/segformer-b5-finetuned-human-parsing"
@@ -60,17 +61,25 @@ class UNetGenerator(nn.Module):
         u4 = self.up4(torch.cat([u3, d1], dim=1))
         return u4
-# ----------------- Load UNet Try-On Model -----------------
-tryon_model = UNetGenerator().to(device)
-checkpoint = torch.load("viton_unet_full_checkpoint.pth", map_location=device)
-tryon_model.load_state_dict(checkpoint['model_state_dict'])
-tryon_model.eval()
 # ----------------- Image Transforms -----------------
-img_transform = transforms.Compose([
-    transforms.Resize((256, 192)),
-    transforms.ToTensor()
-])
 # ----------------- Helper Functions -----------------
 def get_segmentation(image: Image.Image):
@@ -85,39 +94,108 @@ def generate_agnostic(image: Image.Image, segmentation):
     img_np = np.array(image.resize((192, 256)))
     agnostic_np = img_np.copy()
     segmentation_resized = cv2.resize(segmentation.astype(np.uint8), (192, 256), interpolation=cv2.INTER_NEAREST)
-    agnostic_np[segmentation_resized == 4] = [128, 128, 128]  # Mask upper clothes
     return Image.fromarray(agnostic_np)
-def generate_tryon_output(agnostic_img, cloth_img):
     agnostic_tensor = img_transform(agnostic_img).unsqueeze(0).to(device)
     cloth_tensor = img_transform(cloth_img).unsqueeze(0).to(device)
     input_tensor = torch.cat([agnostic_tensor, cloth_tensor], dim=1)
     with torch.no_grad():
-        output = tryon_model(input_tensor)
-    output_img = output.squeeze(0).cpu().permute(1, 2, 0).numpy()
-    output_img = (output_img * 255).astype(np.uint8)
-    return Image.fromarray(output_img)
-# ----------------- Gradio Interface -----------------
-def virtual_tryon(person_image, cloth_image):
     segmentation = get_segmentation(person_image)
     agnostic = generate_agnostic(person_image, segmentation)
-    result = generate_tryon_output(agnostic, cloth_image)
     return agnostic, result
 demo = gr.Interface(
     fn=virtual_tryon,
     inputs=[
         gr.Image(type="pil", label="Person Image"),
-        gr.Image(type="pil", label="Cloth Image")
     ],
     outputs=[
         gr.Image(type="pil", label="Agnostic (Torso Masked)"),
         gr.Image(type="pil", label="Virtual Try-On Output")
     ],
-    title="👕 Virtual Try-On (UNet + Segformer)",
-    description="Upload a person image and a cloth image to try on the cloth virtually."
 )
 if __name__ == "__main__":

 import numpy as np
 from torchvision import transforms
 import cv2
 from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
+from improved_viton import ImprovedUNetGenerator
+# ----------------- Device -----------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ----------------- Load Human Parser Model -----------------
 processor = AutoImageProcessor.from_pretrained("matei-dorian/segformer-b5-finetuned-human-parsing")
 parser_model = SegformerForSemanticSegmentation.from_pretrained(
     "matei-dorian/segformer-b5-finetuned-human-parsing"
         u4 = self.up4(torch.cat([u3, d1], dim=1))
         return u4
 # ----------------- Image Transforms -----------------
+# img_transform = transforms.Compose([
+#     transforms.Resize((256, 192)),
+#     transforms.ToTensor(),
+#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+# ])
+#new changes
+if model_type == "UNet":
+    img_transform = transforms.Compose([
+        transforms.Resize((256, 192)),
+        transforms.ToTensor()
+    ])
+else:
+    img_transform = transforms.Compose([
+        transforms.Resize((256, 192)),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+    ])
+#end new changes
 # ----------------- Helper Functions -----------------
 def get_segmentation(image: Image.Image):
     img_np = np.array(image.resize((192, 256)))
     agnostic_np = img_np.copy()
     segmentation_resized = cv2.resize(segmentation.astype(np.uint8), (192, 256), interpolation=cv2.INTER_NEAREST)
+    clothing_labels = [4]
+    for label in clothing_labels:
+        agnostic_np[segmentation_resized == label] = [128, 128, 128]
     return Image.fromarray(agnostic_np)
+def load_model(model_type):
+    if model_type == "UNet":
+        model = UNetGenerator().to(device)
+        checkpoint = torch.load("viton_unet_full_checkpoint.pth", map_location=device)
+        state_dict = checkpoint.get("model_G_state_dict") or checkpoint.get("model_state_dict")
+    elif model_type == "GAN":
+        model = ImprovedUNetGenerator(in_channels=6, out_channels=3).to(device)
+        checkpoint = torch.load("viton_gan_full_checkpoint.pth", map_location=device)
+        state_dict = checkpoint.get("model_G_state_dict") or checkpoint.get("model_state_dict")
+    elif model_type == "Diffusion":
+        model = ImprovedUNetGenerator(in_channels=6, out_channels=3).to(device)
+        checkpoint = torch.load("viton_diffusion_full_checkpoint.pth", map_location=device)
+        state_dict = checkpoint.get("model_G_state_dict") or checkpoint.get("model_state_dict")
+    else:
+        raise ValueError("Invalid model type")
+    if state_dict is None:
+        raise KeyError(f"No valid state_dict found for model type {model_type}")
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model
+# def generate_tryon_output(person_img, agnostic_img, cloth_img, segmentation, model):
+#     agnostic_tensor = img_transform(agnostic_img).unsqueeze(0).to(device)
+#     cloth_tensor = img_transform(cloth_img).unsqueeze(0).to(device)
+#     input_tensor = torch.cat([agnostic_tensor, cloth_tensor], dim=1)
+#     with torch.no_grad():
+#         output = model(input_tensor)
+#     output_img = output[0].cpu().permute(1, 2, 0).numpy()
+#     output_img = (output_img + 1) / 2
+#     output_img = np.clip(output_img, 0, 1)
+#     person_np = np.array(person_img.resize((192, 256))).astype(np.float32) / 255.0
+#     segmentation_resized = cv2.resize(segmentation.astype(np.uint8), (192, 256), interpolation=cv2.INTER_NEAREST)
+#     blend_mask = (segmentation_resized == 0).astype(np.float32)
+#     blend_mask = np.expand_dims(blend_mask, axis=2)
+#     final_output = blend_mask * person_np + (1 - blend_mask) * output_img
+#     final_output = (final_output * 255).astype(np.uint8)
+#     return Image.fromarray(final_output)
+#new changes
+def generate_tryon_output(person_img, agnostic_img, cloth_img, segmentation, model, model_type):
     agnostic_tensor = img_transform(agnostic_img).unsqueeze(0).to(device)
     cloth_tensor = img_transform(cloth_img).unsqueeze(0).to(device)
     input_tensor = torch.cat([agnostic_tensor, cloth_tensor], dim=1)
     with torch.no_grad():
+        output = model(input_tensor)
+    if model_type == "UNet":
+        output_img = output.squeeze(0).cpu().permute(1, 2, 0).numpy()
+        output_img = (output_img * 255).astype(np.uint8)
+        return Image.fromarray(output_img)
+    else:
+        output_img = output[0].cpu().permute(1, 2, 0).numpy()
+        output_img = (output_img + 1) / 2
+        output_img = np.clip(output_img, 0, 1)
+        person_np = np.array(person_img.resize((192, 256))).astype(np.float32) / 255.0
+        segmentation_resized = cv2.resize(segmentation.astype(np.uint8), (192, 256), interpolation=cv2.INTER_NEAREST)
+        blend_mask = (segmentation_resized == 0).astype(np.float32)
+        blend_mask = np.expand_dims(blend_mask, axis=2)
+        final_output = blend_mask * person_np + (1 - blend_mask) * output_img
+        final_output = (final_output * 255).astype(np.uint8)
+        return Image.fromarray(final_output)
+#new changes end
+# ----------------- Inference Pipeline -----------------
+def virtual_tryon(person_image, cloth_image, model_type):
     segmentation = get_segmentation(person_image)
     agnostic = generate_agnostic(person_image, segmentation)
+    model = load_model(model_type)
+    result = generate_tryon_output(person_image, agnostic, cloth_image, segmentation, model, model_type)
+    # result = generate_tryon_output(person_image, agnostic, cloth_image, segmentation, model)
     return agnostic, result
+# ----------------- Gradio Interface -----------------
 demo = gr.Interface(  # Gradio UI that exposes the virtual_tryon pipeline
     fn=virtual_tryon,  # receives the inputs below, in the order listed
     inputs=[
         gr.Image(type="pil", label="Person Image"),
+        gr.Image(type="pil", label="Cloth Image"),
+        gr.Radio(choices=["UNet", "GAN", "Diffusion"], label="Model Type", value="UNet")  # same strings load_model branches on
     ],
     outputs=[  # same order as virtual_tryon's return value: (agnostic, result)
         gr.Image(type="pil", label="Agnostic (Torso Masked)"),
         gr.Image(type="pil", label="Virtual Try-On Output")
     ],
+    title="👕 Virtual Try-On App",
+    description="Upload a person image and a clothing image, select a model (UNet, GAN, or Diffusion), and try it on virtually."
 )
 if __name__ == "__main__":