wolo-wolo committed
Commit 4d10ed1 · 1 Parent(s): e46e042
app.py CHANGED
@@ -1,220 +1,114 @@
1
  # -*- coding: utf-8 -*-
2
- # Author: Gaojian Wang@ZJUICSR
3
  # --------------------------------------------------------
4
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
  # You can find the license in the LICENSE file in the root directory of this source tree.
6
  # --------------------------------------------------------
7
- # pip uninstall nvidia_cublas_cu11
8
 
9
  import sys
10
-
11
- sys.path.append('..')
12
  import os
13
-
14
  os.system(f'pip install dlib')
15
- import torch
 
16
  import numpy as np
17
  from PIL import Image
18
- from torch.nn import functional as F
19
-
 
20
  import gradio as gr
21
 
22
  import models_vit
23
  from util.datasets import build_dataset
24
- import argparse
25
- from engine_finetune import test_all
26
- import dlib
27
- from huggingface_hub import hf_hub_download
28
-
29
- P = os.path.abspath(__file__)
30
- FRAME_SAVE_PATH = os.path.join(P[:-6], 'frame')
31
- CKPT_SAVE_PATH = os.path.join(P[:-6], 'checkpoints')
32
- CKPT_LIST = ['DfD-Checkpoint_Fine-tuned_on_FF++',
33
- 'FAS-Checkpoint_Fine-tuned_on_MCIO']
34
- CKPT_NAME = {'DfD-Checkpoint_Fine-tuned_on_FF++': 'finetuned_models/FF++_c23_32frames/checkpoint-min_val_loss.pth',
35
- 'FAS-Checkpoint_Fine-tuned_on_MCIO': 'finetuned_models/MCIO_protocol/Both_MCIO/checkpoint-min_val_loss.pth'}
36
- os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
37
- os.makedirs(CKPT_SAVE_PATH, exist_ok=True)
38
 
39
 
40
  def get_args_parser():
41
- parser = argparse.ArgumentParser('MAE fine-tuning for image classification', add_help=False)
42
- parser.add_argument('--batch_size', default=64, type=int,
43
- help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus')
44
  parser.add_argument('--epochs', default=50, type=int)
45
- parser.add_argument('--accum_iter', default=1, type=int,
46
- help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')
47
-
48
- # Model parameters
49
- parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL',
50
- help='Name of model to train')
51
-
52
- parser.add_argument('--input_size', default=224, type=int,
53
- help='images input size')
54
- parser.add_argument('--normalize_from_IMN', action='store_true',
55
- help='cal mean and std from imagenet, else from pretrain datasets')
56
  parser.set_defaults(normalize_from_IMN=True)
57
- parser.add_argument('--apply_simple_augment', action='store_true',
58
- help='apply simple data augment')
59
-
60
- parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT',
61
- help='Drop path rate (default: 0.1)')
62
-
63
- # Optimizer parameters
64
- parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM',
65
- help='Clip gradient norm (default: None, no clipping)')
66
- parser.add_argument('--weight_decay', type=float, default=0.05,
67
- help='weight decay (default: 0.05)')
68
-
69
- parser.add_argument('--lr', type=float, default=None, metavar='LR',
70
- help='learning rate (absolute lr)')
71
- parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
72
- help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
73
- parser.add_argument('--layer_decay', type=float, default=0.75,
74
- help='layer-wise lr decay from ELECTRA/BEiT')
75
-
76
- parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
77
- help='lower lr bound for cyclic schedulers that hit 0')
78
-
79
- parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
80
- help='epochs to warmup LR')
81
-
82
- # Augmentation parameters
83
- parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT',
84
- help='Color jitter factor (enabled only when not using Auto/RandAug)')
85
- parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
86
- help='Use AutoAugment policy. "v0" or "original". " + "(default: rand-m9-mstd0.5-inc1)'),
87
- parser.add_argument('--smoothing', type=float, default=0.1,
88
- help='Label smoothing (default: 0.1)')
89
-
90
- # * Random Erase params
91
- parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
92
- help='Random erase prob (default: 0.25)')
93
- parser.add_argument('--remode', type=str, default='pixel',
94
- help='Random erase mode (default: "pixel")')
95
- parser.add_argument('--recount', type=int, default=1,
96
- help='Random erase count (default: 1)')
97
- parser.add_argument('--resplit', action='store_true', default=False,
98
- help='Do not random erase first (clean) augmentation split')
99
-
100
- # * Mixup params
101
- parser.add_argument('--mixup', type=float, default=0,
102
- help='mixup alpha, mixup enabled if > 0.')
103
- parser.add_argument('--cutmix', type=float, default=0,
104
- help='cutmix alpha, cutmix enabled if > 0.')
105
- parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
106
- help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
107
- parser.add_argument('--mixup_prob', type=float, default=1.0,
108
- help='Probability of performing mixup or cutmix when either/both is enabled')
109
- parser.add_argument('--mixup_switch_prob', type=float, default=0.5,
110
- help='Probability of switching to cutmix when both mixup and cutmix enabled')
111
- parser.add_argument('--mixup_mode', type=str, default='batch',
112
- help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
113
-
114
- # * Finetuning params
115
- parser.add_argument('--finetune', default='',
116
- help='finetune from checkpoint')
117
  parser.add_argument('--global_pool', action='store_true')
118
  parser.set_defaults(global_pool=True)
119
- parser.add_argument('--cls_token', action='store_false', dest='global_pool',
120
- help='Use class token instead of global pool for classification')
121
-
122
- # Dataset parameters
123
- parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str,
124
- help='dataset path')
125
- parser.add_argument('--nb_classes', default=1000, type=int,
126
- help='number of the classification types')
127
-
128
- parser.add_argument('--output_dir', default='',
129
- help='path where to save, empty for no saving')
130
- parser.add_argument('--log_dir', default='',
131
- help='path where to tensorboard log')
132
- parser.add_argument('--device', default='cuda',
133
- help='device to use for training / testing')
134
  parser.add_argument('--seed', default=0, type=int)
135
- parser.add_argument('--resume', default='',
136
- help='resume from checkpoint')
137
-
138
- parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
139
- help='start epoch')
140
- parser.add_argument('--eval', action='store_true',
141
- help='Perform evaluation only')
142
  parser.set_defaults(eval=True)
143
- parser.add_argument('--dist_eval', action='store_true', default=False,
144
- help='Enabling distributed evaluation (recommended during training for faster monitor')
145
  parser.add_argument('--num_workers', default=10, type=int)
146
- parser.add_argument('--pin_mem', action='store_true',
147
- help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
148
  parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
149
  parser.set_defaults(pin_mem=True)
150
-
151
- # distributed training parameters
152
- parser.add_argument('--world_size', default=1, type=int,
153
- help='number of distributed processes')
154
  parser.add_argument('--local_rank', default=-1, type=int)
155
  parser.add_argument('--dist_on_itp', action='store_true')
156
- parser.add_argument('--dist_url', default='env://',
157
- help='url used to set up distributed training')
158
-
159
  return parser
160
 
161
 
162
- args = get_args_parser()
163
- args = args.parse_args()
164
- args.nb_classes = 2
165
-
166
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
167
-
168
- model = models_vit.__dict__['vit_base_patch16'](
169
- num_classes=args.nb_classes,
170
- drop_path_rate=args.drop_path,
171
- global_pool=args.global_pool,
172
- ).to(device)
173
-
174
-
175
- def load_model(ckpt):
176
- if ckpt == 'choose from here' or 'continuously updating...':
177
- return gr.update()
178
- args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_NAME[ckpt])
179
- if os.path.isfile(args.resume) == False:
180
- hf_hub_download(local_dir=CKPT_SAVE_PATH,
181
- repo_id='Wolowolo/fsfm-3c/' + CKPT_NAME[ckpt],
182
- filename=ckpt)
183
- checkpoint = torch.load(args.resume, map_location='cpu')
184
- model.load_state_dict(checkpoint['model'])
185
  model.eval()
186
- return gr.update()
187
 
188
 
189
  def get_boundingbox(face, width, height, minsize=None):
190
- """
191
- From FF++:
192
- https://github.com/ondyari/FaceForensics/blob/master/classification/detect_from_video.py
193
- Expects a dlib face to generate a quadratic bounding box.
194
- :param face: dlib face class
195
- :param width: frame width
196
- :param height: frame height
197
- :param cfg.face_scale: bounding box size multiplier to get a bigger face region
198
- :param minsize: set minimum bounding box size
199
- :return: x, y, bounding_box_size in opencv form
200
- """
201
- x1 = face.left()
202
- y1 = face.top()
203
- x2 = face.right()
204
- y2 = face.bottom()
205
  size_bb = int(max(x2 - x1, y2 - y1) * 1.3)
206
- if minsize:
207
- if size_bb < minsize:
208
- size_bb = minsize
209
  center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
210
-
211
- # Check for out of bounds, x-y top left corner
212
- x1 = max(int(center_x - size_bb // 2), 0)
213
- y1 = max(int(center_y - size_bb // 2), 0)
214
- # Check for too big bb size for given x, y
215
  size_bb = min(width - x1, size_bb)
216
  size_bb = min(height - y1, size_bb)
217
-
218
  return x1, y1, size_bb
219
 
220
 
@@ -222,200 +116,226 @@ def extract_face(frame):
222
  face_detector = dlib.get_frontal_face_detector()
223
  image = np.array(frame.convert('RGB'))
224
  faces = face_detector(image, 1)
225
- if len(faces) > 0:
226
- # For now only take the biggest face
227
  face = faces[0]
228
- # Face crop and rescale(follow FF++)
229
  x, y, size = get_boundingbox(face, image.shape[1], image.shape[0])
230
- # Get the landmarks/parts for the face in box d only with the five key points
231
  cropped_face = image[y:y + size, x:x + size]
232
- # cropped_face = cv2.resize(cropped_face, (224, 224), interpolation=cv2.INTER_CUBIC)
233
  return Image.fromarray(cropped_face)
234
- else:
235
- return None
236
 
237
 
238
  def get_frame_index_uniform_sample(total_frame_num, extract_frame_num):
239
- interval = np.linspace(0, total_frame_num - 1, num=extract_frame_num, dtype=int)
240
- return interval.tolist()
241
-
242
-
243
- import cv2
244
 
245
 
246
- def extract_face_from_fixed_num_frames(src_video, dst_path, num_frames=None, device='cpu'):
247
- """
248
- 1) extract specific num of frames from videos in [1st(index 0) frame, last frame] with uniform sample interval
249
- 2) extract face from frame with specific enlarge size
250
- """
251
  video_capture = cv2.VideoCapture(src_video)
252
- total_frames = video_capture.get(7)
253
-
254
- # extract from the 1st(index 0) frame
255
- if num_frames is not None:
256
- frame_indices = get_frame_index_uniform_sample(total_frames, num_frames)
257
- else:
258
- frame_indices = range(int(total_frames))
259
-
260
  for frame_index in frame_indices:
261
  video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
262
  ret, frame = video_capture.read()
263
- image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
264
- img = extract_face(image)
265
- if img == None:
266
- continue
267
- img = img.resize((224, 224), Image.BICUBIC)
268
  if not ret:
269
  continue
270
- save_img_name = f"frame_{frame_index}.png"
271
-
272
- img.save(os.path.join(dst_path, '0', save_img_name))
273
- # cv2.imwrite(os.path.join(dst_path, '0', save_img_name), frame)
274
-
 
275
  video_capture.release()
276
- # cv2.destroyAllWindows()
277
  return frame_indices
278
 
279
 
280
- def FSFM3C_video_detection(video, ckpt_select_dropdown):
281
- # extract frames
282
- num_frames = 32
283
-
284
- files = os.listdir(FRAME_SAVE_PATH)
285
- num_files = len(files)
286
- frame_path = os.path.join(FRAME_SAVE_PATH, str(num_files))
287
  os.makedirs(frame_path, exist_ok=True)
288
  os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
289
- frame_indices = extract_face_from_fixed_num_frames(video, frame_path, num_frames=num_frames, device=device)
290
-
291
- args.data_path = frame_path
292
- args.batch_size = 32
293
- dataset_val = build_dataset(is_train=False, args=args)
294
- sampler_val = torch.utils.data.SequentialSampler(dataset_val)
295
- data_loader_val = torch.utils.data.DataLoader(
296
- dataset_val, sampler=sampler_val,
297
- batch_size=args.batch_size,
298
- num_workers=args.num_workers,
299
- pin_memory=args.pin_mem,
300
- drop_last=False
301
- )
302
-
303
- frame_preds_list, video_pred_list = test_all(data_loader_val, model, device)
304
-
305
- real_prob_frames = [round(1. - fake_score, 2) for fake_score in video_pred_list]
306
- frame_results = {f"frame_{frame}": f"{int(real_prob_frames[i] * 100)}%" for i, frame in enumerate(frame_indices)}
307
-
308
- real_prob_video = int(round(1. - (sum(video_pred_list) / len(video_pred_list)), 2) * 100)
309
- if real_prob_video > 50:
310
- result_message = "real" if 'FAS' not in ckpt_select_dropdown else 'spoof'
311
- else:
312
- result_message = "fake" if 'FAS' not in ckpt_select_dropdown else 'real'
313
- prob = 1 - real_prob_image if real_prob_video <= 50 else real_prob_video
314
- image_results = (f"The face in this image may be {result_message} with probability is {real_prob_image}%")
315
-
316
- video_results = (f"The face in this video may be {result_message} with probability {prob}")
317
-
318
- return video_results
319
-
320
-
321
- def FSFM3C_image_detection(image, ckpt_select_dropdown):
322
- files = os.listdir(FRAME_SAVE_PATH)
323
- num_files = len(files)
324
- frame_path = os.path.join(FRAME_SAVE_PATH, str(num_files))
325
- os.makedirs(frame_path, exist_ok=True)
326
- os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
327
-
328
- save_img_name = f"frame_0.png"
329
  img = extract_face(image)
330
  if img is None:
331
- return ['Invalid Input']
332
  img = img.resize((224, 224), Image.BICUBIC)
333
- img.save(os.path.join(frame_path, '0', save_img_name))
334
-
335
  args.data_path = frame_path
336
  args.batch_size = 1
337
  dataset_val = build_dataset(is_train=False, args=args)
338
  sampler_val = torch.utils.data.SequentialSampler(dataset_val)
339
- data_loader_val = torch.utils.data.DataLoader(
340
- dataset_val, sampler=sampler_val,
341
- batch_size=args.batch_size,
342
- num_workers=args.num_workers,
343
- pin_memory=args.pin_mem,
344
- drop_last=False
345
- )
346
-
347
- frame_preds_list, video_pred_list = test_all(data_loader_val, model, device)
348
-
349
- real_prob_image = int(round(1. - (sum(video_pred_list) / len(video_pred_list)), 2) * 100)
350
- if real_prob_image > 50:
351
- result_message = "real" if 'FAS' not in ckpt_select_dropdown else 'spoof'
352
- else:
353
- result_message = "fake" if 'FAS' not in ckpt_select_dropdown else 'real'
354
- prob = 1 - real_prob_image if real_prob_image <= 50 else real_prob_image
355
- image_results = (f"The face in this image may be {result_message} with probability is {real_prob_image}%")
356
-
357
- return image_results
358
-
359
-
360
- # WebUI
361
- with gr.Blocks() as demo:
362
- gr.HTML(
363
- "<h1 style='text-align: center;'>🦱 Real Facial Image&Video Detection <br> Against Face Forgery and Spoofing (Deepfake/Diffusion/Presentation-attacks)</h1>")
364
- gr.Markdown("### ---Powered by the fine-tuned model that is pre-trained from [FSFM-3C](https://fsfm-3c.github.io/)")
365
-
366
- gr.Markdown("### Release:")
367
-
368
- gr.Markdown("- <b>V1.0 [2024-12] (Current):</b> "
369
- "Create this page with basic detectors (simply fine-tuned models) that follow the paper implementation. "
370
- "<b>Notes:</b> Performance is limited because no any optimization of data, models, hyperparameters, etc. is done for downstream tasks. <br> "
371
- "<b>[TODO]: </b> Update practical models, and optimized interfaces, and provide more functions such as visualizations, a unified detector, and multi-modal diagnosis.")
372
-
373
- gr.Markdown(
374
- "> Please provide an <b>image</b> or a <b>video (<100s </b>, default to uniform sampling 32 frames)</b> and <b>select the model</b> for detection. <br>"
375
- "- <b>DfD-Checkpoint_Fine-tuned_on_FF++</b> for deepfake detection, FSFM VIT-B fine-tuned on the FF++_c23 dataset (train&val sets of 4 manipulations, 32 frames per video) <br>"
376
- "- <b>FAS-Checkpoint_Fine-tuned_on_MCIO</b> for face anti-spoofing, FSFM VIT-B fine-tuned on the MCIO datasets (2 frames per video) ")
377
-
378
- with gr.Column():
379
  ckpt_select_dropdown = gr.Dropdown(
380
- label="Select the Model Checkpoint for Detection (🖱️ below)",
381
- choices=['choose from here'] + CKPT_LIST + ['continuously updating...'],
 
382
  multiselect=False,
383
- value='choose from here',
384
  interactive=True,
385
  )
386
- with gr.Row(elem_classes="center-align"):
387
- with gr.Column(scale=5):
388
- gr.Markdown(
389
- "## Image Detection"
390
- )
391
- image = gr.Image(label="Upload/Capture/Paste your image", type="pil")
392
- image_submit_btn = gr.Button("Submit")
393
- output_results_image = gr.Textbox(label="Detection Result")
394
- with gr.Column(scale=5):
395
- gr.Markdown(
396
- "## Video Detection"
397
- )
398
- video = gr.Video(label="Upload/Capture your video")
399
- video_submit_btn = gr.Button("Submit")
400
- output_results_video = gr.Textbox(label="Detection Result")
401
 
 
 
 
 
 
402
  image_submit_btn.click(
403
  fn=FSFM3C_image_detection,
404
- inputs=[image, ckpt_select_dropdown],
405
  outputs=[output_results_image],
406
  )
407
  video_submit_btn.click(
408
  fn=FSFM3C_video_detection,
409
- inputs=[video, ckpt_select_dropdown],
410
  outputs=[output_results_video],
411
  )
412
- ckpt_select_dropdown.change(
413
- fn=load_model,
414
- inputs=[ckpt_select_dropdown],
415
- outputs=[ckpt_select_dropdown],
416
- )
417
 
418
  if __name__ == "__main__":
419
  gr.close_all()
420
  demo.queue()
421
- demo.launch()
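
One detail worth flagging in the removed load_model above: the guard "if ckpt == 'choose from here' or 'continuously updating...':" is always truthy, because the second operand is a bare non-empty string rather than a comparison, so the V0.1 function returned before ever loading a checkpoint. A membership test avoids this; the minimal sketch below illustrates the idea only (the new app.py further down instead checks "select_skpt not in CKPT_NAME").

# Minimal illustration (not the commit's code): reject placeholder dropdown
# entries with a membership test instead of the always-true `==` / `or` chain.
PLACEHOLDERS = ('choose from here', 'continuously updating...')

def should_skip_loading(ckpt_choice: str) -> bool:
    # True only when the dropdown still shows a placeholder entry
    return ckpt_choice in PLACEHOLDERS

print(should_skip_loading('choose from here'))                   # True
print(should_skip_loading('DfD-Checkpoint_Fine-tuned_on_FF++'))  # False
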
 
 
1
  # -*- coding: utf-8 -*-
2
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
3
  # --------------------------------------------------------
4
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
  # You can find the license in the LICENSE file in the root directory of this source tree.
6
  # --------------------------------------------------------
 
7
 
8
  import sys
 
 
9
  import os
 
10
  os.system(f'pip install dlib')
11
+ import dlib
12
+ import argparse
13
  import numpy as np
14
  from PIL import Image
15
+ import cv2
16
+ import torch
17
+ from huggingface_hub import hf_hub_download
18
  import gradio as gr
19
 
20
  import models_vit
21
  from util.datasets import build_dataset
22
+ from engine_finetune import test_two_class, test_multi_class
23
 
24
 
25
  def get_args_parser():
26
+ parser = argparse.ArgumentParser('FSFM3C fine-tuning&Testing for image classification', add_help=False)
27
+ parser.add_argument('--batch_size', default=64, type=int, help='Batch size per GPU')
 
28
  parser.add_argument('--epochs', default=50, type=int)
29
+ parser.add_argument('--accum_iter', default=1, type=int, help='Accumulate gradient iterations')
30
+ parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL', help='Name of model to train')
31
+ parser.add_argument('--input_size', default=224, type=int, help='images input size')
32
+ parser.add_argument('--normalize_from_IMN', action='store_true', help='cal mean and std from imagenet')
33
  parser.set_defaults(normalize_from_IMN=True)
34
+ parser.add_argument('--apply_simple_augment', action='store_true', help='apply simple data augment')
35
+ parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT', help='Drop path rate')
36
+ parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM', help='Clip gradient norm')
37
+ parser.add_argument('--weight_decay', type=float, default=0.05, help='weight decay')
38
+ parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate')
39
+ parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', help='base learning rate')
40
+ parser.add_argument('--layer_decay', type=float, default=0.75, help='layer-wise lr decay')
41
+ parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR', help='lower lr bound')
42
+ parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N', help='epochs to warmup LR')
43
+ parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT', help='Color jitter factor')
44
+ parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME', help='Use AutoAugment policy')
45
+ parser.add_argument('--smoothing', type=float, default=0.1, help='Label smoothing')
46
+ parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT', help='Random erase prob')
47
+ parser.add_argument('--remode', type=str, default='pixel', help='Random erase mode')
48
+ parser.add_argument('--recount', type=int, default=1, help='Random erase count')
49
+ parser.add_argument('--resplit', action='store_true', default=False, help='Do not random erase first augmentation split')
50
+ parser.add_argument('--mixup', type=float, default=0, help='mixup alpha')
51
+ parser.add_argument('--cutmix', type=float, default=0, help='cutmix alpha')
52
+ parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None, help='cutmix min/max ratio')
53
+ parser.add_argument('--mixup_prob', type=float, default=1.0, help='Probability of performing mixup or cutmix')
54
+ parser.add_argument('--mixup_switch_prob', type=float, default=0.5, help='Probability of switching to cutmix')
55
+ parser.add_argument('--mixup_mode', type=str, default='batch', help='How to apply mixup/cutmix params')
56
+ parser.add_argument('--finetune', default='', help='finetune from checkpoint')
57
  parser.add_argument('--global_pool', action='store_true')
58
  parser.set_defaults(global_pool=True)
59
+ parser.add_argument('--cls_token', action='store_false', dest='global_pool', help='Use class token for classification')
60
+ parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str, help='dataset path')
61
+ parser.add_argument('--nb_classes', default=1000, type=int, help='number of the classification types')
62
+ parser.add_argument('--output_dir', default='', help='path where to save')
63
+ parser.add_argument('--log_dir', default='', help='path where to tensorboard log')
64
+ parser.add_argument('--device', default='cuda', help='device to use for training / testing')
65
  parser.add_argument('--seed', default=0, type=int)
66
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
67
+ parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch')
68
+ parser.add_argument('--eval', action='store_true', help='Perform evaluation only')
69
  parser.set_defaults(eval=True)
70
+ parser.add_argument('--dist_eval', action='store_true', default=False, help='Enabling distributed evaluation')
 
71
  parser.add_argument('--num_workers', default=10, type=int)
72
+ parser.add_argument('--pin_mem', action='store_true', help='Pin CPU memory in DataLoader')
 
73
  parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
74
  parser.set_defaults(pin_mem=True)
75
+ parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
76
  parser.add_argument('--local_rank', default=-1, type=int)
77
  parser.add_argument('--dist_on_itp', action='store_true')
78
+ parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
79
  return parser
80
 
81
 
82
+ def load_model(select_skpt):
83
+ global ckpt, device, model, checkpoint
84
+ if select_skpt not in CKPT_NAME:
85
+ return gr.update(), "Select a correct model"
86
+ ckpt = select_skpt
87
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
+ args.nb_classes = CKPT_CLASS[ckpt]
89
+ model = models_vit.__dict__[CKPT_MODEL[ckpt]](
90
+ num_classes=args.nb_classes,
91
+ drop_path_rate=args.drop_path,
92
+ global_pool=args.global_pool,
93
+ ).to(device)
94
+
95
+ args.resume = os.path.join(CKPT_SAVE_PATH, ckpt)
96
+ args.resume = CKPT_PATH[ckpt]
97
+ checkpoint = torch.load(args.resume, map_location=device)
98
+ model.load_state_dict(checkpoint['model'], strict=False)
99
  model.eval()
100
+ return gr.update(), f"[Loaded Model Successfully: {args.resume}]"
101
 
102
 
103
  def get_boundingbox(face, width, height, minsize=None):
104
+ x1, y1, x2, y2 = face.left(), face.top(), face.right(), face.bottom()
105
  size_bb = int(max(x2 - x1, y2 - y1) * 1.3)
106
+ if minsize and size_bb < minsize:
107
+ size_bb = minsize
 
108
  center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
109
+ x1, y1 = max(int(center_x - size_bb // 2), 0), max(int(center_y - size_bb // 2), 0)
110
  size_bb = min(width - x1, size_bb)
111
  size_bb = min(height - y1, size_bb)
 
112
  return x1, y1, size_bb
113
 
114
 
 
116
  face_detector = dlib.get_frontal_face_detector()
117
  image = np.array(frame.convert('RGB'))
118
  faces = face_detector(image, 1)
119
+ if faces:
 
120
  face = faces[0]
 
121
  x, y, size = get_boundingbox(face, image.shape[1], image.shape[0])
 
122
  cropped_face = image[y:y + size, x:x + size]
 
123
  return Image.fromarray(cropped_face)
124
+ return None
 
125
 
126
 
127
  def get_frame_index_uniform_sample(total_frame_num, extract_frame_num):
128
+ return np.linspace(0, total_frame_num - 1, num=extract_frame_num, dtype=int).tolist()
129
 
130
 
131
+ def extract_face_from_fixed_num_frames(src_video, dst_path, num_frames=None):
132
  video_capture = cv2.VideoCapture(src_video)
133
+ total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
134
+ frame_indices = get_frame_index_uniform_sample(total_frames, num_frames) if num_frames else range(total_frames)
135
  for frame_index in frame_indices:
136
  video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
137
  ret, frame = video_capture.read()
138
  if not ret:
139
  continue
140
+ image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
141
+ img = extract_face(image)
142
+ if img:
143
+ img = img.resize((224, 224), Image.BICUBIC)
144
+ save_img_name = f"frame_{frame_index}.png"
145
+ img.save(os.path.join(dst_path, '0', save_img_name))
146
  video_capture.release()
 
147
  return frame_indices
148
 
149
 
150
+ def FSFM3C_image_detection(image):
151
+ frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
152
  os.makedirs(frame_path, exist_ok=True)
153
  os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
154
  img = extract_face(image)
155
  if img is None:
156
+ return 'No face detected, please upload a clear face!'
157
  img = img.resize((224, 224), Image.BICUBIC)
158
+ img.save(os.path.join(frame_path, '0', "frame_0.png"))
 
159
  args.data_path = frame_path
160
  args.batch_size = 1
161
  dataset_val = build_dataset(is_train=False, args=args)
162
  sampler_val = torch.utils.data.SequentialSampler(dataset_val)
163
+ data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False)
164
+
165
+ if CKPT_CLASS[ckpt] > 2:
166
+ frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
167
+ class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated', 'Spoofing or Presentation-attack']
168
+ avg_video_pred = np.mean(video_pred_list, axis=0)
169
+ max_prob_index = np.argmax(avg_video_pred)
170
+ max_prob_class = class_names[max_prob_index]
171
+ probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]
172
+ image_results = f"The largest face in this image may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]"
173
+ return image_results
174
+
175
+ if CKPT_CLASS[ckpt] == 2:
176
+ frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
177
+ if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
178
+ prob = sum(video_pred_list) / len(video_pred_list)
179
+ label = "Deepfake" if prob <= 0.5 else "Real"
180
+ prob = prob if label == "Real" else 1 - prob
181
+ if ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
182
+ prob = sum(video_pred_list) / len(video_pred_list)
183
+ label = "Spoofing" if prob <= 0.5 else "Bonafide"
184
+ prob = prob if label == "Bonafide" else 1 - prob
185
+ image_results = f"The largest face in this image may be {label} with probability {prob * 100:.1f}%"
186
+ return image_results
187
+
188
+
189
+ def FSFM3C_video_detection(video, num_frames):
190
+ try:
191
+ frame_path = os.path.join(FRAME_SAVE_PATH, str(len(os.listdir(FRAME_SAVE_PATH))))
192
+ os.makedirs(frame_path, exist_ok=True)
193
+ os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
194
+ frame_indices = extract_face_from_fixed_num_frames(video, frame_path, num_frames=num_frames)
195
+ args.data_path = frame_path
196
+ args.batch_size = num_frames
197
+ dataset_val = build_dataset(is_train=False, args=args)
198
+ sampler_val = torch.utils.data.SequentialSampler(dataset_val)
199
+ data_loader_val = torch.utils.data.DataLoader(dataset_val, sampler=sampler_val, batch_size=args.batch_size, num_workers=args.num_workers, pin_memory=args.pin_mem, drop_last=False)
200
+
201
+ if CKPT_CLASS[ckpt] > 2:
202
+ frame_preds_list, video_pred_list = test_multi_class(data_loader_val, model, device)
203
+ class_names = ['Real or Bonafide', 'Deepfake', 'Diffusion or AIGC generated', 'Spoofing or Presentation-attack']
204
+ avg_video_pred = np.mean(video_pred_list, axis=0)
205
+ max_prob_index = np.argmax(avg_video_pred)
206
+ max_prob_class = class_names[max_prob_index]
207
+ probabilities = [f"{class_names[i]}: {prob * 100:.1f}%" for i, prob in enumerate(avg_video_pred)]
208
+
209
+ frame_results = {f"frame_{frame_indices[i]}": [f"{class_names[j]}: {prob * 100:.1f}%" for j, prob in enumerate(frame_preds_list[i])] for i in range(len(frame_indices))}
210
+ video_results = (f"The largest face in this video may be {max_prob_class} with probability: \n [{', '.join(probabilities)}]\n \n"
211
+ f"The frame-level detection results ['frame_index': 'probabilities']: \n{frame_results}")
212
+ return video_results
213
+
214
+ if CKPT_CLASS[ckpt] == 2:
215
+ frame_preds_list, video_pred_list = test_two_class(data_loader_val, model, device)
216
+ if ckpt == 'DfD-Checkpoint_Fine-tuned_on_FF++':
217
+ prob = sum(video_pred_list) / len(video_pred_list)
218
+ label = "Deepfake" if prob <= 0.5 else "Real"
219
+ prob = prob if label == "Real" else 1 - prob
220
+ frame_results = {f"frame_{frame_indices[i]}": f"{(frame_preds_list[i]) * 100:.1f}%" for i in
221
+ range(len(frame_indices))} if label == "Real" else {f"frame_{frame_indices[i]}": f"{(1 - frame_preds_list[i]) * 100:.1f}%" for i in
222
+ range(len(frame_indices))}
223
+
224
+ if ckpt == 'FAS-Checkpoint_Fine-tuned_on_MCIO':
225
+ prob = sum(video_pred_list) / len(video_pred_list)
226
+ label = "Spoofing" if prob <= 0.5 else "Bonafide"
227
+ prob = prob if label == "Bonafide" else 1 - prob
228
+ frame_results = {f"frame_{frame_indices[i]}": f"{(frame_preds_list[i]) * 100:.1f}%" for i in
229
+ range(len(frame_indices))} if label == "Bonafide" else {f"frame_{frame_indices[i]}": f"{(1 - frame_preds_list[i]) * 100:.1f}%" for i in
230
+ range(len(frame_indices))}
231
+
232
+ video_results = (f"The largest face in this video may be {label} with probability {prob * 100:.1f}%\n \n"
233
+ f"The frame-level detection results ['frame_index': 'real_face_probability']: \n{frame_results}")
234
+ return video_results
235
+ except Exception as e:
236
+ return f"Error occurred. Please provide a clear face video or reduce the number of frames."
237
+
238
+ # Paths and Constants
239
+ P = os.path.abspath(__file__)
240
+ FRAME_SAVE_PATH = os.path.join(os.path.dirname(P), 'frame')
241
+ CKPT_SAVE_PATH = os.path.join(os.path.dirname(P), 'checkpoints')
242
+ os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
243
+ os.makedirs(CKPT_SAVE_PATH, exist_ok=True)
244
+ CKPT_NAME = [
245
+ '✨Unified-detector_v1_Fine-tuned_on_4_classes',
246
+ 'DfD-Checkpoint_Fine-tuned_on_FF++',
247
+ 'FAS-Checkpoint_Fine-tuned_on_MCIO',
248
+ ]
249
+ # CKPT_PATH = {
250
+ # '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'finetuned_models/Unified-detector/v1_Fine-tuned_on_4_classes/checkpoint-min_val_loss.pth',
251
+ # 'DfD-Checkpoint_Fine-tuned_on_FF++': 'finetuned_models/FF++_c23_32frames/checkpoint-min_val_loss.pth',
252
+ # 'FAS-Checkpoint_Fine-tuned_on_MCIO': 'finetuned_models/MCIO_protocol/Both_MCIO/checkpoint-min_val_loss.pth',
253
+ # }
254
+ CKPT_PATH = {
255
+ '✨Unified-detector_v1_Fine-tuned_on_4_classes': './checkpoints/checkpoint-min_train_loss.pth',
256
+ 'DfD-Checkpoint_Fine-tuned_on_FF++': '/mnt/localDisk2/wgj/FSFM/released/FSFM-main/fsfm-3c/finuetune/cross_dataset_DfD/checkpoint/finetuned_models/ft_on_FF++_c23_32frames/pt_from_VF2_ViT-B_epoch600/checkpoint-min_val_loss.pth',
257
+ 'FAS-Checkpoint_Fine-tuned_on_MCIO': '/mnt/localDisk2/wgj/FSFM/FSFM-3C/codespace/fsfm-3c/finuetune/cross_dataset_DfD/finetuned_models/FAS_MCIO/checkpoint-199.pth',
258
+ }
259
+ CKPT_CLASS = {
260
+ '✨Unified-detector_v1_Fine-tuned_on_4_classes': 4,
261
+ 'DfD-Checkpoint_Fine-tuned_on_FF++': 2,
262
+ 'FAS-Checkpoint_Fine-tuned_on_MCIO': 2
263
+ }
264
+ CKPT_MODEL = {
265
+ '✨Unified-detector_v1_Fine-tuned_on_4_classes': 'vit_base_patch16',
266
+ 'DfD-Checkpoint_Fine-tuned_on_FF++': 'vit_base_patch16',
267
+ 'FAS-Checkpoint_Fine-tuned_on_MCIO': 'vit_base_patch16',
268
+ }
269
+
270
+
271
+ with gr.Blocks(css=".custom-label { font-weight: bold !important; font-size: 16px !important; }") as demo:
272
+ gr.HTML("<h1 style='text-align: center;'>🦱 Real Facial Image&Video Detection <br> Against Face Forgery (Deepfake/Diffusion) and Spoofing (Presentation-attacks)</h1>")
273
+ gr.Markdown("<b>☉ Powered by fine-tuned models pre-trained with [FSFM-3C](https://fsfm-3c.github.io/)</b> <br> "
274
+ "<b>☉ Release (continuously updating)</b> <br> <b>[V1.0]</b> 2025/02/22-Current 🎉: "
275
+ "1) Added <b>[✨Unified-detector_v1] for Unified Physical-Digital Face Attack&Forgery Detection, a vanilla ViT-B/16-224 (FSFM pre-trained) that can identify Real&Bonafide, Deepfake, Diffusion&AIGC-generated, and Spoofing&Presentation-attack facial images or videos</b>; 2) Added selection of the number of video frames (uniform sampling; more frames take longer, and we would be grateful for support to enable paid GPU acceleration); 3) Fixed the V0.1 errors in model loading and prediction. <br>"
276
+ "<b>[V0.1]</b> 2024/12-2025/02/21: "
277
+ "Created this page with basic detectors [DfD-Checkpoint_Fine-tuned_on_FF++, FAS-Checkpoint_Fine-tuned_on_MCIO] that follow the paper implementation. <br> ")
278
+ gr.Markdown("- Please <b>provide a facial image or video (<100s)</b> and <b>select the model</b> for detection: <br> <b>[suggested] [✨Unified-detector_v1_Fine-tuned_on_4_classes]</b>: <b>an FSFM pre-trained ViT-B/16-224 that detects Real/Deepfake/Diffusion/Spoofing facial images&videos</b> <br> <b>[DfD-Checkpoint_Fine-tuned_on_FF++]</b> for deepfake detection, FSFM ViT-B/16-224 fine-tuned on the FF++_c23 train&val sets (4 manipulations, 32 frames per video) <br> <b>[FAS-Checkpoint_Fine-tuned_on_MCIO]</b> for face anti-spoofing, FSFM ViT-B/16-224 fine-tuned on the MCIO datasets (2 frames per video)")
279
+
280
+
281
+ with gr.Row():
282
  ckpt_select_dropdown = gr.Dropdown(
283
+ label="Select the Model for Detection ⬇️",
284
+ elem_classes="custom-label",
285
+ choices=['Choose Model Here 🖱️'] + CKPT_NAME + ['continuously updating...'],
286
  multiselect=False,
287
+ value='Choose Model Here 🖱️',
288
  interactive=True,
289
  )
290
+ model_loading_status = gr.Textbox(label="Model Loading Status")
291
+ with gr.Row():
292
+ with gr.Column(scale=5):
293
+ gr.Markdown("### Image Detection")
294
+ image = gr.Image(label="Upload/Capture/Paste your image", type="pil")
295
+ image_submit_btn = gr.Button("Submit")
296
+ output_results_image = gr.Textbox(label="Detection Result")
297
+ with gr.Column(scale=5):
298
+ gr.Markdown("### Video Detection")
299
+ video = gr.Video(label="Upload/Capture your video")
300
+ frame_slider = gr.Slider(minimum=1, maximum=32, step=1, value=32, label="Number of Frames for Detection")
301
+ video_submit_btn = gr.Button("Submit")
302
+ output_results_video = gr.Textbox(label="Detection Result")
 
 
303
 
304
+ ckpt_select_dropdown.change(
305
+ fn=load_model,
306
+ inputs=[ckpt_select_dropdown],
307
+ outputs=[ckpt_select_dropdown, model_loading_status],
308
+ )
309
  image_submit_btn.click(
310
  fn=FSFM3C_image_detection,
311
+ inputs=[image],
312
  outputs=[output_results_image],
313
  )
314
  video_submit_btn.click(
315
  fn=FSFM3C_video_detection,
316
+ inputs=[video, frame_slider],
317
  outputs=[output_results_video],
318
  )
319
+
320
 
321
  if __name__ == "__main__":
322
+ args = get_args_parser()
323
+ args = args.parse_args()
324
+ ckpt = 'DfD-Checkpoint_Fine-tuned_on_FF++'
325
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
326
+ args.nb_classes = CKPT_CLASS[ckpt]
327
+ model = models_vit.__dict__[CKPT_MODEL[ckpt]](
328
+ num_classes=args.nb_classes,
329
+ drop_path_rate=args.drop_path,
330
+ global_pool=args.global_pool,
331
+ ).to(device)
332
+ args.resume = os.path.join(CKPT_SAVE_PATH, ckpt)
333
+ args.resume = CKPT_PATH[ckpt]
334
+ checkpoint = torch.load(args.resume, map_location=device)
335
+ model.load_state_dict(checkpoint['model'], strict=False)
336
+ model.eval()
337
+
338
  gr.close_all()
339
  demo.queue()
340
+ # demo.launch()
341
+ demo.launch(server_name="0.0.0.0", server_port=8888)
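
Before moving to engine_finetune.py, here is a minimal, self-contained sketch (not part of the commit) of the two numeric helpers the new app.py relies on: the uniform frame sampling of get_frame_index_uniform_sample and the 1.3x square face-box enlargement of get_boundingbox. The dlib rectangle is mocked with a namedtuple whose fields are plain attributes rather than dlib's left()/top()/right()/bottom() methods, so treat it purely as an illustration of the arithmetic, not the detection pipeline.

from collections import namedtuple
import numpy as np

# Stand-in for a dlib rectangle; the real detector returns objects with
# left()/top()/right()/bottom() methods instead of plain attributes.
FaceRect = namedtuple('FaceRect', 'left top right bottom')


def uniform_frame_indices(total_frame_num, extract_frame_num):
    # Same idea as get_frame_index_uniform_sample: evenly spaced indices
    # from the first frame (index 0) to the last frame, inclusive.
    return np.linspace(0, total_frame_num - 1, num=extract_frame_num, dtype=int).tolist()


def enlarged_face_box(face, width, height, minsize=None):
    # Same idea as get_boundingbox: a square box 1.3x the larger side of the
    # detection, re-centred on the face, then clamped to the frame borders.
    x1, y1, x2, y2 = face.left, face.top, face.right, face.bottom
    size_bb = int(max(x2 - x1, y2 - y1) * 1.3)
    if minsize and size_bb < minsize:
        size_bb = minsize
    center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
    x1 = max(int(center_x - size_bb // 2), 0)
    y1 = max(int(center_y - size_bb // 2), 0)
    size_bb = min(width - x1, size_bb)
    size_bb = min(height - y1, size_bb)
    return x1, y1, size_bb


if __name__ == '__main__':
    # 8 frames sampled from a 300-frame clip: [0, 42, 85, 128, 170, 213, 256, 299]
    print(uniform_frame_indices(total_frame_num=300, extract_frame_num=8))
    # A 120x140 detection becomes a 182-px square starting at (69, 99)
    print(enlarged_face_box(FaceRect(100, 120, 220, 260), width=640, height=480))
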
engine_finetune.py CHANGED
@@ -1,323 +1,130 @@
1
  # -*- coding: utf-8 -*-
2
- # Author: Gaojian Wang@ZJUICSR
3
  # --------------------------------------------------------
4
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
  # You can find the license in the LICENSE file in the root directory of this source tree.
6
  # --------------------------------------------------------
7
 
8
- import math
9
- import sys
10
- from typing import Iterable, Optional
11
-
12
  import numpy as np
13
  import torch
14
-
15
- from timm.data import Mixup
16
- from timm.utils import accuracy
17
 
18
  import util.misc as misc
19
- import util.lr_sched as lr_sched
20
  from util.metrics import *
21
 
22
- import torch.nn.functional as F
23
-
24
-
25
- def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
26
- data_loader: Iterable, optimizer: torch.optim.Optimizer,
27
- device: torch.device, epoch: int, loss_scaler, max_norm: float = 0,
28
- mixup_fn: Optional[Mixup] = None, log_writer=None,
29
- args=None):
30
- model.train(True)
31
- metric_logger = misc.MetricLogger(delimiter=" ")
32
- metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
33
- header = 'Epoch: [{}]'.format(epoch)
34
- print_freq = 20
35
-
36
- accum_iter = args.accum_iter
37
-
38
- optimizer.zero_grad()
39
-
40
- if log_writer is not None:
41
- print('log_dir: {}'.format(log_writer.log_dir))
42
-
43
- for data_iter_step, (samples, targets) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
44
-
45
- # we use a per iteration (instead of per epoch) lr scheduler
46
- if data_iter_step % accum_iter == 0:
47
- lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)
48
-
49
- samples = samples.to(device, non_blocking=True)
50
- targets = targets.to(device, non_blocking=True)
51
-
52
- if mixup_fn is not None:
53
- samples, targets = mixup_fn(samples, targets)
54
-
55
- with torch.cuda.amp.autocast():
56
- # outputs = model(samples)
57
- outputs = model(samples).to(device, non_blocking=True) # modified
58
- loss = criterion(outputs, targets)
59
-
60
- loss_value = loss.item()
61
-
62
- if not math.isfinite(loss_value):
63
- print("Loss is {}, stopping training".format(loss_value))
64
- sys.exit(1)
65
-
66
- loss /= accum_iter
67
- loss_scaler(loss, optimizer, clip_grad=max_norm,
68
- parameters=model.parameters(), create_graph=False,
69
- update_grad=(data_iter_step + 1) % accum_iter == 0)
70
- if (data_iter_step + 1) % accum_iter == 0:
71
- optimizer.zero_grad()
72
-
73
- torch.cuda.synchronize()
74
-
75
- metric_logger.update(loss=loss_value)
76
- min_lr = 10.
77
- max_lr = 0.
78
- for group in optimizer.param_groups:
79
- min_lr = min(min_lr, group["lr"])
80
- max_lr = max(max_lr, group["lr"])
81
-
82
- metric_logger.update(lr=max_lr)
83
-
84
- loss_value_reduce = misc.all_reduce_mean(loss_value)
85
- if log_writer is not None and (data_iter_step + 1) % accum_iter == 0:
86
- """ We use epoch_1000x as the x-axis in tensorboard.
87
- This calibrates different curves when batch size changes.
88
- """
89
- epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
90
- log_writer.add_scalar('loss', loss_value_reduce, epoch_1000x)
91
- log_writer.add_scalar('lr', max_lr, epoch_1000x)
92
-
93
- # gather the stats from all processes
94
- metric_logger.synchronize_between_processes()
95
- print("Averaged stats:", metric_logger)
96
- return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
97
-
98
-
99
- @torch.no_grad()
100
- def evaluate(data_loader, model, device):
101
- criterion = torch.nn.CrossEntropyLoss()
102
-
103
- metric_logger = misc.MetricLogger(delimiter=" ")
104
- header = 'Test:'
105
-
106
- # switch to evaluation mode
107
- model.eval()
108
-
109
- for batch in metric_logger.log_every(data_loader, 10, header):
110
- images = batch[0]
111
- target = batch[-1]
112
- images = images.to(device, non_blocking=True)
113
- target = target.to(device, non_blocking=True)
114
-
115
- # compute output
116
- with torch.cuda.amp.autocast():
117
- # output = model(images)
118
- output = model(images).to(device, non_blocking=True) # modified
119
- loss = criterion(output, target)
120
-
121
- # acc1, acc5 = accuracy(output, target, topk=(1, 5))
122
- acc = float(accuracy(output, target, topk=(1,))[0])
123
- preds = (F.softmax(output, dim=1)[:, 1].detach().cpu().numpy())
124
- trues = (target.detach().cpu().numpy())
125
- auc_score = roc_auc_score(trues, preds) * 100.
126
-
127
- batch_size = images.shape[0]
128
- metric_logger.update(loss=loss.item())
129
- # metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
130
- # metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
131
- metric_logger.meters['acc'].update(acc, n=batch_size)
132
- metric_logger.meters['auc'].update(auc_score, n=batch_size)
133
-
134
- # gather the stats from all processes
135
- metric_logger.synchronize_between_processes()
136
- # print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}'
137
- # .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss))
138
- print('* Acc {acc.global_avg:.3f} Auc {auc.global_avg:.3f} loss {losses.global_avg:.3f}'
139
- .format(acc=metric_logger.acc, auc=metric_logger.auc, losses=metric_logger.loss))
140
-
141
- return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
142
-
143
-
144
- @torch.no_grad()
145
- def test_ori(data_loader, model, device):
146
- criterion = torch.nn.CrossEntropyLoss()
147
-
148
- metric_logger = misc.MetricLogger(delimiter=" ")
149
- header = 'Test:'
150
-
151
- # switch to evaluation mode
152
- model.eval()
153
-
154
- labels = np.array([])
155
- preds = np.array([])
156
-
157
- for batch in metric_logger.log_every(data_loader, 10, header):
158
- images = batch[0]
159
- target = batch[-1]
160
- images = images.to(device, non_blocking=True)
161
- target = target.to(device, non_blocking=True)
162
-
163
- # compute output
164
- with torch.cuda.amp.autocast():
165
- # output = model(images)
166
- output = model(images).to(device, non_blocking=True) # modified
167
- loss = criterion(output, target)
168
-
169
- # acc1, acc5 = accuracy(output, target, topk=(1, 5))
170
- acc = float(accuracy(output, target, topk=(1,))[0])
171
- pred = (F.softmax(output, dim=1)[:, 1].detach().cpu().numpy())
172
- preds = np.append(preds, pred)
173
- label = (target.detach().cpu().numpy())
174
- labels = np.append(labels, label)
175
-
176
- batch_size = images.shape[0]
177
- metric_logger.update(loss=loss.item())
178
- # metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
179
- # metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
180
- metric_logger.meters['acc'].update(acc, n=batch_size)
181
-
182
- # gather the stats from all processes
183
- metric_logger.synchronize_between_processes()
184
- # print('* Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f} loss {losses.global_avg:.3f}'
185
- # .format(top1=metric_logger.acc1, top5=metric_logger.acc5, losses=metric_logger.loss))
186
- auc_score = roc_auc_score(labels, preds) * 100.
187
- metric_logger.meters['auc'].update(auc_score)
188
- print('* Acc {acc.global_avg:.3f} Auc {auc.global_avg:.3f} loss {losses.global_avg:.3f}'
189
- .format(acc=metric_logger.acc, auc=metric_logger.auc, losses=metric_logger.loss))
190
-
191
- return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
192
-
193
 
194
  @torch.no_grad()
195
- def test(data_loader, model, device):
196
  criterion = torch.nn.CrossEntropyLoss()
197
 
198
- metric_logger = misc.MetricLogger(delimiter=" ")
199
- header = 'Test:'
200
-
201
  # switch to evaluation mode
202
  model.eval()
203
 
204
  frame_labels = np.array([]) # int label
205
  frame_preds = np.array([]) # pred logit
206
  frame_y_preds = np.array([]) # pred int
 
207
 
208
- # for batch in metric_logger.log_every(data_loader, print_freq=len(data_loader), header=header):
209
  for batch in data_loader:
210
  images = batch[0] # torch.Size([BS, C, H, W])
211
  target = batch[1] # torch.Size([BS])
212
-
213
  images = images.to(device, non_blocking=True)
214
  target = target.to(device, non_blocking=True)
215
 
216
- # compute output
217
- with torch.cuda.amp.autocast():
218
- # output = model(images)
219
- output = model(images).to(device, non_blocking=True) # modified
220
- loss = criterion(output, target)
221
 
222
  frame_pred = (F.softmax(output, dim=1)[:, 1].detach().cpu().numpy())
223
  frame_preds = np.append(frame_preds, frame_pred)
224
-
225
  frame_y_pred = np.argmax(output.detach().cpu().numpy(), axis=1)
226
  frame_y_preds = np.append(frame_y_preds, frame_y_pred)
227
 
228
  frame_label = (target.detach().cpu().numpy())
229
  frame_labels = np.append(frame_labels, frame_label)
 
230
 
231
- metric_logger.update(loss=loss.item())
232
-
233
- # gather the stats from all processes
234
- metric_logger.synchronize_between_processes()
235
- metric_logger.meters['frame_acc'].update(frame_level_acc(frame_labels, frame_y_preds))
236
- metric_logger.meters['frame_balanced_acc'].update(frame_level_balanced_acc(frame_labels, frame_y_preds))
237
- metric_logger.meters['frame_auc'].update(frame_level_auc(frame_labels, frame_preds))
238
- metric_logger.meters['frame_eer'].update(frame_level_eer(frame_labels, frame_preds))
239
-
240
- print('*[------FRAME-LEVEL------] \n'
241
- 'Acc {frame_acc.global_avg:.3f} Balanced_Acc {frame_balanced_acc.global_avg:.3f} '
242
- 'Auc {frame_auc.global_avg:.3f} EER {frame_eer.global_avg:.3f} loss {losses.global_avg:.3f}'
243
- .format(frame_acc=metric_logger.frame_acc, frame_balanced_acc=metric_logger.frame_balanced_acc,
244
- frame_auc=metric_logger.frame_auc, frame_eer=metric_logger.frame_eer, losses=metric_logger.loss))
245
 
246
- return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
247
 
248
 
249
  @torch.no_grad()
250
- def test_all(data_loader, model, device):
251
  criterion = torch.nn.CrossEntropyLoss()
252
 
253
- metric_logger = misc.MetricLogger(delimiter=" ")
254
- header = 'Test:'
255
-
256
  # switch to evaluation mode
257
  model.eval()
258
 
259
  frame_labels = np.array([]) # int label
260
- frame_preds = np.array([]) # pred logit
261
  frame_y_preds = np.array([]) # pred int
262
  video_names_list = list()
263
 
264
- # for batch in metric_logger.log_every(data_loader, print_freq=len(data_loader), header=header):
265
  for batch in data_loader:
266
  images = batch[0] # torch.Size([BS, C, H, W])
267
  target = batch[1] # torch.Size([BS])
268
  video_name = batch[-1] # list[BS]
269
-
270
  images = images.to(device, non_blocking=True)
271
  target = target.to(device, non_blocking=True)
272
 
273
- # compute output
274
- # with torch.cuda.amp.autocast():
275
- # output = model(images)
276
- output = model(images).to(device, non_blocking=True) # modified
277
  loss = criterion(output, target)
278
 
279
- frame_pred = (F.softmax(output, dim=1)[:, 1].detach().cpu().numpy())
280
- frame_preds = np.append(frame_preds, frame_pred)
281
-
282
  frame_y_pred = np.argmax(output.detach().cpu().numpy(), axis=1)
283
  frame_y_preds = np.append(frame_y_preds, frame_y_pred)
284
 
285
- frame_label = (target.detach().cpu().numpy())
286
  frame_labels = np.append(frame_labels, frame_label)
287
-
288
  video_names_list.extend(list(video_name))
289
 
290
- metric_logger.update(loss=loss.item())
291
-
292
- # gather the stats from all processes
293
- # metric_logger.synchronize_between_processes()
294
- # metric_logger.meters['frame_acc'].update(frame_level_acc(frame_labels, frame_y_preds))
295
- # metric_logger.meters['frame_balanced_acc'].update(frame_level_balanced_acc(frame_labels, frame_y_preds))
296
- # metric_logger.meters['frame_auc'].update(frame_level_auc(frame_labels, frame_preds))
297
- # metric_logger.meters['frame_eer'].update(frame_level_eer(frame_labels, frame_preds))
298
-
299
- # print('*[------FRAME-LEVEL------] \n'
300
- # 'Acc {frame_acc.global_avg:.3f} Balanced_Acc {frame_balanced_acc.global_avg:.3f} '
301
- # 'Auc {frame_auc.global_avg:.3f} EER {frame_eer.global_avg:.3f} loss {losses.global_avg:.3f}'
302
- # .format(frame_acc=metric_logger.frame_acc, frame_balanced_acc=metric_logger.frame_balanced_acc,
303
- # frame_auc=metric_logger.frame_auc, frame_eer=metric_logger.frame_eer, losses=metric_logger.loss))
304
-
305
  # video-level metrics:
306
  frame_labels_list = frame_labels.tolist()
307
  frame_preds_list = frame_preds.tolist()
 
308
 
309
- video_label_list, video_pred_list, video_y_pred_list = get_video_level_label_pred(frame_labels_list, video_names_list, frame_preds_list)
310
- # print(len(video_label_list), len(video_pred_list), len(video_y_pred_list))
311
- # metric_logger.meters['video_acc'].update(video_level_acc(video_label_list, video_y_pred_list))
312
- # metric_logger.meters['video_balanced_acc'].update(video_level_balanced_acc(video_label_list, video_y_pred_list))
313
- # metric_logger.meters['video_auc'].update(video_level_auc(video_label_list, video_pred_list))
314
- # metric_logger.meters['video_eer'].update(frame_level_eer(video_label_list, video_pred_list))
315
 
316
- # print('*[------VIDEO-LEVEL------] \n'
317
- # 'Acc {video_acc.global_avg:.3f} Balanced_Acc {video_balanced_acc.global_avg:.3f} '
318
- # 'Auc {video_auc.global_avg:.3f} EER {video_eer.global_avg:.3f}'
319
- # .format(video_acc=metric_logger.video_acc, video_balanced_acc=metric_logger.video_balanced_acc,
320
- # video_auc=metric_logger.video_auc, video_eer=metric_logger.video_eer))
321
 
322
- # return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
323
- return frame_preds_list, video_pred_list
1
  # -*- coding: utf-8 -*-
2
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
3
  # --------------------------------------------------------
4
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
5
  # You can find the license in the LICENSE file in the root directory of this source tree.
6
  # --------------------------------------------------------
7
 
8
  import numpy as np
9
  import torch
10
+ import torch.nn.functional as F
 
 
11
 
12
  import util.misc as misc
 
13
  from util.metrics import *
14
 
15
 
16
  @torch.no_grad()
17
+ def test_two_class(data_loader, model, device):
18
  criterion = torch.nn.CrossEntropyLoss()
19
 
20
  # switch to evaluation mode
21
  model.eval()
22
 
23
  frame_labels = np.array([]) # int label
24
  frame_preds = np.array([]) # pred logit
25
  frame_y_preds = np.array([]) # pred int
26
+ video_names_list = list()
27
 
 
28
  for batch in data_loader:
29
  images = batch[0] # torch.Size([BS, C, H, W])
30
  target = batch[1] # torch.Size([BS])
31
+ video_name = batch[-1] # list[BS]
32
  images = images.to(device, non_blocking=True)
33
  target = target.to(device, non_blocking=True)
34
 
35
+ output = model(images).to(device, non_blocking=True) # modified
36
+ loss = criterion(output, target)
37
 
38
  frame_pred = (F.softmax(output, dim=1)[:, 1].detach().cpu().numpy())
39
  frame_preds = np.append(frame_preds, frame_pred)
 
40
  frame_y_pred = np.argmax(output.detach().cpu().numpy(), axis=1)
41
  frame_y_preds = np.append(frame_y_preds, frame_y_pred)
42
 
43
  frame_label = (target.detach().cpu().numpy())
44
  frame_labels = np.append(frame_labels, frame_label)
45
+ video_names_list.extend(list(video_name))
46
 
47
+ # video-level metrics:
48
+ frame_labels_list = frame_labels.tolist()
49
+ frame_preds_list = frame_preds.tolist()
50
+ video_label_list, video_pred_list, video_y_pred_list = get_video_level_label_pred(frame_labels_list, video_names_list, frame_preds_list)
51
 
52
+ return frame_preds_list, video_pred_list
53
 
54
 
55
  @torch.no_grad()
56
+ def test_multi_class(data_loader, model, device):
57
  criterion = torch.nn.CrossEntropyLoss()
58
 
 
 
 
59
  # switch to evaluation mode
60
  model.eval()
61
 
62
  frame_labels = np.array([]) # int label
63
+ frame_preds = np.empty((0, 4)) # pred logit, initialize as 2D array with 4 columns for 4 classes
64
  frame_y_preds = np.array([]) # pred int
65
  video_names_list = list()
66
 
 
67
  for batch in data_loader:
68
  images = batch[0] # torch.Size([BS, C, H, W])
69
  target = batch[1] # torch.Size([BS])
70
  video_name = batch[-1] # list[BS]
 
71
  images = images.to(device, non_blocking=True)
72
  target = target.to(device, non_blocking=True)
73
 
74
+ output = model(images).to(device, non_blocking=True)
75
  loss = criterion(output, target)
76
 
77
+ frame_pred = F.softmax(output, dim=1).detach().cpu().numpy()
78
+ frame_preds = np.append(frame_preds, frame_pred, axis=0)
 
79
  frame_y_pred = np.argmax(output.detach().cpu().numpy(), axis=1)
80
  frame_y_preds = np.append(frame_y_preds, frame_y_pred)
81
 
82
+ frame_label = target.detach().cpu().numpy()
83
  frame_labels = np.append(frame_labels, frame_label)
 
84
  video_names_list.extend(list(video_name))
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  # video-level metrics:
87
  frame_labels_list = frame_labels.tolist()
88
  frame_preds_list = frame_preds.tolist()
89
+ video_label_list, video_pred_list, video_y_pred_list = get_video_level_label_pred_multi_class(frame_labels_list, video_names_list, frame_preds_list)
90
 
91
+ return frame_preds_list, video_pred_list
92
 
93
 
94
+ # @torch.no_grad()
95
+ # def test_multi_class(data_loader, model, device):
96
+ # criterion = torch.nn.CrossEntropyLoss()
97
+ #
98
+ # # switch to evaluation mode
99
+ # model.eval()
100
+ #
101
+ # frame_labels = np.array([]) # int label
102
+ # frame_preds = np.array([]) # pred logit
103
+ # frame_y_preds = np.array([]) # pred int
104
+ # video_names_list = list()
105
+ #
106
+ # for batch in data_loader:
107
+ # images = batch[0] # torch.Size([BS, C, H, W])
108
+ # target = batch[1] # torch.Size([BS])
109
+ # video_name = batch[-1] # list[BS]
110
+ # images = images.to(device, non_blocking=True)
111
+ # target = target.to(device, non_blocking=True)
112
+ #
113
+ # output = model(images).to(device, non_blocking=True)
114
+ # loss = criterion(output, target)
115
+ #
116
+ # frame_pred = F.softmax(output, dim=1).detach().cpu().numpy()
117
+ # frame_preds = np.append(frame_preds, frame_pred, axis=0)
118
+ # frame_y_pred = np.argmax(output.detach().cpu().numpy(), axis=1)
119
+ # frame_y_preds = np.append(frame_y_preds, frame_y_pred)
120
+ #
121
+ # frame_label = target.detach().cpu().numpy()
122
+ # frame_labels = np.append(frame_labels, frame_label)
123
+ # video_names_list.extend(list(video_name))
124
+ #
125
+ # # video-level metrics:
126
+ # frame_labels_list = frame_labels.tolist()
127
+ # frame_preds_list = frame_preds.tolist()
128
+ # video_label_list, video_pred_list, video_y_pred_list = get_video_level_label_pred_multi_class(frame_labels_list, video_names_list, frame_preds_list)
129
+ #
130
+ # return frame_preds_list, video_pred_list
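A minimal smoke-test sketch of how the new test_two_class helper can be driven. It relies only on the batch layout read in the loop above (images as batch[0], integer labels as batch[1], per-frame video names as batch[-1]); the DummyFrameDataset and the tiny linear model are hypothetical stand-ins, and the call assumes test_two_class and get_video_level_label_pred from util.metrics are in scope, as in app.py.

# Hypothetical smoke test for test_two_class; only the (image, label, video_name)
# batch layout matters, the dataset and model here are stand-ins.
import torch
from torch.utils.data import Dataset, DataLoader

class DummyFrameDataset(Dataset):
    """Yields (image, label, video_name) tuples like the real frame datasets."""
    def __init__(self, n_videos=4, frames_per_video=8):
        self.items = [(torch.randn(3, 224, 224),  # fake image tensor
                       v % 2,                     # 0 = real, 1 = fake
                       f"video_{v:03d}")          # frame -> video name
                      for v in range(n_videos)
                      for _ in range(frames_per_video)]

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

loader = DataLoader(DummyFrameDataset(), batch_size=16, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Stand-in 2-class classifier over flattened 224x224 RGB frames.
model = torch.nn.Sequential(torch.nn.Flatten(),
                            torch.nn.Linear(3 * 224 * 224, 2)).to(device)

frame_scores, video_scores = test_two_class(loader, model, device)
print(len(frame_scores), len(video_scores))  # e.g. 32 frame scores, 4 video-level scores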
models_vit.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/crop.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/datasets.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/lars.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/loss_contrastive.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/lr_decay.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/lr_sched.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/metrics.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
@@ -68,6 +68,34 @@ def get_video_level_label_pred(f_label_list, v_name_list, f_pred_list):
      return video_label_list, video_pred_list, video_y_pred_list


+ def get_video_level_label_pred_multi_class(f_label_list, v_name_list, f_pred_list):
+     """
+     Adapted for multi-class predictions: each frame score is a per-class
+     probability vector; a video's frame scores are averaged class-wise and the
+     argmax of the mean vector gives the video-level prediction.
+     """
+     import numpy as np
+     video_res_dict = dict()
+     video_pred_list = list()
+     video_y_pred_list = list()
+     video_label_list = list()
+
+     # Summarize all the results for each video
+     for label, video, score in zip(f_label_list, v_name_list, f_pred_list):
+         if video not in video_res_dict.keys():
+             video_res_dict[video] = {"scores": [score], "label": label}
+         else:
+             video_res_dict[video]["scores"].append(score)
+
+     # Get the score and label for each video
+     for video, res in video_res_dict.items():
+         avg_score = np.mean(res['scores'], axis=0)
+         label = res['label']
+         video_pred_list.append(avg_score)
+         video_label_list.append(label)
+         video_y_pred_list.append(np.argmax(avg_score))
+
+     return video_label_list, video_pred_list, video_y_pred_list
+
+
  def video_level_acc(video_label_list, video_y_pred_list):
      return accuracy_score(video_label_list, video_y_pred_list) * 100.
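A toy worked example of the frame-to-video aggregation performed by the new get_video_level_label_pred_multi_class (made-up scores, four classes): the two frames of vid_a are averaged class-wise before the argmax, while vid_b contributes a single frame.

# Toy illustration of the per-video averaging; all numbers are made up.
f_label_list = [2, 2, 0]                    # frame-level integer labels
v_name_list  = ['vid_a', 'vid_a', 'vid_b']  # frame -> video mapping
f_pred_list  = [[0.1, 0.2, 0.6, 0.1],       # per-frame softmax scores (4 classes)
                [0.2, 0.1, 0.5, 0.2],
                [0.7, 0.1, 0.1, 0.1]]

labels, preds, y_preds = get_video_level_label_pred_multi_class(
    f_label_list, v_name_list, f_pred_list)
# labels  -> [2, 0]
# preds   -> [array([0.15, 0.15, 0.55, 0.15]), array([0.7, 0.1, 0.1, 0.1])]
# y_preds -> [2, 0]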
util/misc.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.
util/pos_embed.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # Author: Gaojian Wang@ZJUICSR
+ # Author: Gaojian Wang@ZJUICSR; TongWu@ZJUICSR
  # --------------------------------------------------------
  # This source code is licensed under the Attribution-NonCommercial 4.0 International License.
  # You can find the license in the LICENSE file in the root directory of this source tree.