# ------------------------------------------
# TextDiffuser: Diffusion Models as Text Painters
# Paper Link: https://arxiv.org/abs/2305.10855
# Code Link: https://github.com/microsoft/unilm/tree/master/textdiffuser
# Copyright (c) Microsoft Corporation.
# This file provides the evaluation script (CLIPScore and FID) for the MARIOEval benchmark.
# ------------------------------------------
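# Example invocation (a sketch; "eval.py" stands in for this file's actual name):
#   python eval.py --root /path/to/data/TextDiffuser/evaluation/ --gpu 0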
import json
import os
import numpy as np
import argparse
from clipscore import cal_clipscore
from fid_score import calculate_fid_given_paths
def eval_clipscore(root_eval, root_res, dataset, device="cuda:0", num_images_per_prompt=4):
    """Compute the CLIPScore of one dataset, averaged over `num_images_per_prompt` seeds."""
with open(os.path.join(root_eval, dataset, dataset + '.txt'), 'r') as fr:
text_list = fr.readlines()
text_list = [_.strip() for _ in text_list]
clip_scores = []
scores = []
for seed in range(num_images_per_prompt):
        # Stable Diffusion results are stored as .png; the other methods use .jpg
        img_ext = '.png' if 'stablediffusion' in root_res else '.jpg'
        image_list = [os.path.join(root_res, dataset, 'images_' + str(seed),
                                   str(idx) + '_' + str(seed) + img_ext) for idx in range(len(text_list))]
        image_ids = [str(idx) + '_' + str(seed) + img_ext for idx in range(len(text_list))]
score = cal_clipscore(image_ids=image_ids, image_paths=image_list, text_list=text_list, device=device)
clip_score = np.mean([s['CLIPScore'] for s in score.values()])
clip_scores.append(clip_score)
scores.append(score)
print("clip_score:", np.mean(clip_scores), clip_scores)
return np.mean(clip_scores), scores
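# Directory layout assumed by eval_clipscore above (inferred from the path joins; not an
# authoritative description of the MARIOEval release):
#   <root>/MARIOEval/<dataset>/<dataset>.txt                       one prompt per line
#   <root>/MARIOEval/<dataset>/images/                             ground-truth images (FID datasets only)
#   <root>/generation/<method>/<dataset>/images_<seed>/<idx>_<seed>.jpg|.png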
def MARIOEval_evaluate_results(root, datasets_with_images, datasets, methods, gpu,
                               eval_clipscore_flag=True, eval_fid_flag=True, num_images_per_prompt=4):
    """Evaluate one method (selected by `gpu`) on all datasets and write per-method eval.json files."""
root_eval = os.path.join(root, "MARIOEval")
method_res = {}
device = "cuda:" + str(gpu)
for method_idx, method in enumerate(methods):
        if method_idx != gpu:  # each GPU/process evaluates a different method so they can run in parallel
continue
print("\nmethod:", method)
dataset_res = {}
root_res = os.path.join(root, 'generation', method)
for dataset in datasets:
print("dataset:", dataset)
dataset_res[dataset] = {}
if eval_clipscore_flag:
dataset_res[dataset]['clipscore'], dataset_res[dataset]['scores'] =\
eval_clipscore(root_eval, root_res, dataset, device, num_images_per_prompt)
            if eval_fid_flag and dataset in datasets_with_images:
                # FID needs ground-truth images, which only some datasets provide
                gt_path = os.path.join(root_eval, dataset, 'images')
fids = []
for idx in range(num_images_per_prompt):
gen_path = os.path.join(root_res, dataset, 'images_' + str(idx))
fids.append(calculate_fid_given_paths(paths=[gt_path, gen_path]))
print("fid:", np.mean(fids), fids)
dataset_res[dataset]['fid'] = np.mean(fids)
        # aggregate CLIPScore across all datasets: average per-image scores within each seed, then over seeds
        if eval_clipscore_flag:
method_clipscores = []
for seed in range(num_images_per_prompt):
clipscore_list = []
for dataset in dataset_res.keys():
clipscore_list += [_['CLIPScore'] for _ in dataset_res[dataset]['scores'][seed].values()]
method_clipscores.append(np.mean(clipscore_list))
method_clipscore = np.mean(method_clipscores)
dataset_res['clipscore'] = method_clipscore
        # aggregate FID across the datasets that have ground-truth images, one value per seed
        if eval_fid_flag:
method_fids = []
for idx in range(num_images_per_prompt):
gt_paths = []
gen_paths = []
for dataset in dataset_res.keys():
if dataset in datasets_with_images:
gt_paths.append(os.path.join(root_eval, dataset, 'images'))
gen_paths.append(os.path.join(root_res, dataset, 'images_' + str(idx)))
if len(gt_paths):
method_fids.append(calculate_fid_given_paths(paths=[gt_paths, gen_paths]))
print("fid:", np.mean(method_fids), method_fids)
method_fid = np.mean(method_fids)
dataset_res['fid'] = method_fid
method_res[method] = dataset_res
with open(os.path.join(root_res, 'eval.json'), 'w') as fw:
json.dump(dataset_res, fw)
print(method_res)
with open(os.path.join(root, 'generation', 'eval.json'), 'w') as fw:
json.dump(method_res, fw)
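# Shape of the per-method eval.json written inside the loop above (read back by
# merge_eval_results; the per-dataset 'fid' key only exists for datasets with ground-truth images):
#   {"<dataset>": {"clipscore": float, "scores": [...], "fid": float}, ...,
#    "clipscore": float,   # average over all datasets
#    "fid": float}         # average over the datasets with ground-truth images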
def merge_eval_results(root, methods):
    """Merge the per-method eval.json files into a single summary at generation/eval.json."""
    method_res = {}
    for method in methods:
        root_res = os.path.join(root, 'generation', method)
        eval_path = os.path.join(root_res, 'eval.json')
        if not os.path.exists(eval_path):  # skip methods that have not been evaluated yet
            continue
        with open(eval_path, 'r') as fr:
            dataset_res = json.load(fr)
        for v in dataset_res.values():
            if isinstance(v, dict):
                v.pop('scores', None)  # the per-image CLIP scores are too long to keep in the summary
        method_res[method] = dataset_res
with open(os.path.join(root, 'generation', 'eval.json'), 'w') as fw:
json.dump(method_res, fw)
def parse_args():
    parser = argparse.ArgumentParser(description="Evaluate generated images with CLIPScore and FID on MARIOEval.")
parser.add_argument(
"--dataset",
type=str,
default='TMDBEval500',
required=False,
choices=['TMDBEval500', 'OpenLibraryEval500', 'LAIONEval4000',
'ChineseDrawText', 'DrawBenchText', 'DrawTextCreative']
)
    parser.add_argument(
        "--root",
        type=str,
        required=True,
        help="directory that contains the MARIOEval/ and generation/ folders",
    )
parser.add_argument(
"--method",
type=str,
default='controlnet',
required=False,
choices=['controlnet', 'deepfloyd', 'stablediffusion', 'textdiffuser']
)
parser.add_argument(
"--gpu",
type=int,
default=0,
required=False,
)
parser.add_argument(
"--split",
type=int,
default=0,
required=False,
)
parser.add_argument(
"--total_split",
type=int,
default=1,
required=False,
)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
datasets_with_images = ['TMDBEval500', 'OpenLibraryEval500', 'LAIONEval4000']
datasets = datasets_with_images + ['ChineseDrawText', 'DrawBenchText', 'DrawTextCreative']
methods = ['textdiffuser', 'controlnet', 'deepfloyd', 'stablediffusion']
MARIOEval_evaluate_results(args.root, datasets_with_images, datasets, methods, args.gpu,
eval_clipscore_flag=True, eval_fid_flag=True, num_images_per_prompt=4)
merge_eval_results(args.root, methods)
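# Note: each run evaluates only the method whose index in `methods` equals --gpu, so the
# script is meant to be launched once per GPU in parallel; merge_eval_results produces a
# complete generation/eval.json only after every method has its own eval.json.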