# Preprocess images by converting them into fingerprints and saving the results to disk.
import os
import random
import pickle
import logging

import numpy as np
import h5py
import joblib
from tqdm import tqdm

from utils import azi_diff
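
# NOTE: `azi_diff` (imported from utils) is assumed here to return a dict whose
# 'total_emb' entry is a pair of arrays -- index 0 the "rich" fingerprint and
# index 1 the "poor" one -- since that is how its result is unpacked below;
# the exact array shapes depend on patch_num and N.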

def get_image_files(directory):
    """Recursively collect image file paths under `directory`."""
    image_extensions = {'.jpg', '.jpeg', '.png'}
    image_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if os.path.splitext(file)[1].lower() in image_extensions:
                image_files.append(os.path.join(root, file))
    return image_files

def load_image_files(class1_dirs, class2_dirs):
    """Gather real (class 1) and fake (class 2) image paths, balanced to equal size."""
    class1_files = []
    for directory in tqdm(class1_dirs):
        class1_files.extend(get_image_files(directory))
    class2_files = []
    for directory in tqdm(class2_dirs):
        class2_files.extend(get_image_files(directory))
    # Ensure equal representation by truncating both classes to the smaller count
    min_length = min(len(class1_files), len(class2_files))
    random.shuffle(class1_files)
    random.shuffle(class2_files)
    class1_files = class1_files[:min_length]
    class2_files = class2_files[:min_length]
    print(f"Number of files: Real = {len(class1_files)}, Fake = {len(class2_files)}")
    return class1_files, class2_files

def process_and_save_h5(file_label_pairs, patch_num, N, save_interval, joblib_batch_size, output_dir, start_by=0):
    """Fingerprint every (path, label) pair and write the results to .h5 files
    in chunks of `save_interval` images, running `joblib_batch_size` images in
    parallel at a time. `start_by` skips already-processed files when resuming."""
    def process_file(file_label):
        path, label = file_label
        try:
            result = azi_diff(path, patch_num, N)
            return result, label
        except Exception as e:
            logging.error(f"Error processing file {path}: {str(e)}")
            return None, None
    num_files = len(file_label_pairs)
    num_saves = (num_files - start_by + save_interval - 1) // save_interval
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with tqdm(total=num_files - start_by, desc="Processing files", unit="image") as pbar:
        for save_index in range(num_saves):
            save_start = start_by + save_index * save_interval
            save_end = min(save_start + save_interval, num_files)
            batch_pairs = file_label_pairs[save_start:save_end]
            all_rich = []
            all_poor = []
            all_labels = []
            for batch_start in range(0, len(batch_pairs), joblib_batch_size):
                batch_end = min(batch_start + joblib_batch_size, len(batch_pairs))
                small_batch_pairs = batch_pairs[batch_start:batch_end]
                processed_data = joblib.Parallel(n_jobs=-1)(
                    joblib.delayed(process_file)(file_label) for file_label in small_batch_pairs
                )
                for data, label in processed_data:
                    if data is not None:
                        all_rich.append(data['total_emb'][0])
                        all_poor.append(data['total_emb'][1])
                        all_labels.append(label)
                pbar.update(len(small_batch_pairs))
            next_save_start = save_end
            output_filename = f"{output_dir}/processed_data_{next_save_start}.h5"
            logging.info(f"Saving {output_filename}")
            with h5py.File(output_filename, 'w') as h5file:
                h5file.create_dataset('rich', data=np.array(all_rich))
                h5file.create_dataset('poor', data=np.array(all_poor))
                h5file.create_dataset('labels', data=np.array(all_labels))
            logging.info(f"Successfully saved {output_filename}")
            del all_rich
            del all_poor
            del all_labels
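
# A minimal read-back sketch (not part of the original pipeline): `load_h5_batch`
# is a hypothetical helper that reopens one saved chunk, assuming the dataset
# names written above ('rich', 'poor', 'labels'); handy for sanity-checking a
# run before training on the output.
def load_h5_batch(path):
    with h5py.File(path, 'r') as h5file:
        rich = h5file['rich'][:]      # rich fingerprints, one row per image
        poor = h5file['poor'][:]      # poor fingerprints, one row per image
        labels = h5file['labels'][:]  # 0 = real, 1 = fake
    return rich, poor, labels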

# Configuration
load = False  # True: reuse the pickled (path, label) list; False: rebuild it from the directories below
class1_dirs = [
    "/home/archive/real/",
    "/home/13k_real/",
    "/home/AI_detection_dataset/Real_AI_SD_LD_Dataset/train/real/",
    "/home/AI_detection_dataset/Real_AI_SD_LD_Dataset/test/real/"
]  # real -> label 0
class2_dirs = [
    "/home/archive/fakeV2/fake-v2/",
    "/home/dalle3/",
    "/home/AI_detection_dataset/Real_AI_SD_LD_Dataset/train/fake/",
    "/home/AI_detection_dataset/Real_AI_SD_LD_Dataset/test/fake/"
]  # fake -> label 1
output_dir = "/content/drive/MyDrive/h5saves"
file_paths_pickle_save_dir = '/content/drive/MyDrive/aigc_file_paths.pkl'
patch_num = 128           # forwarded to azi_diff
N = 256                   # forwarded to azi_diff
save_interval = 2000      # images per saved .h5 file
joblib_batch_size = 400   # images per parallel joblib chunk
start_by = 0              # resume offset into file_label_pairs

logging.basicConfig(level=logging.INFO)
if load:
    with open(file_paths_pickle_save_dir, 'rb') as file:
        file_label_pairs = pickle.load(file)
    print(len(file_label_pairs))
else:
    class1_files, class2_files = load_image_files(class1_dirs, class2_dirs)
    file_label_pairs = list(zip(class1_files, [0] * len(class1_files))) + list(zip(class2_files, [1] * len(class2_files)))
    random.shuffle(file_label_pairs)
    with open(file_paths_pickle_save_dir, 'wb') as file:
        pickle.dump(file_label_pairs, file)
    print(len(file_label_pairs))
process_and_save_h5(file_label_pairs, patch_num, N, save_interval, joblib_batch_size, output_dir, start_by)
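
# Assuming the defaults above (save_interval = 2000, start_by = 0), the run is
# expected to leave files like
#   /content/drive/MyDrive/h5saves/processed_data_2000.h5
#   /content/drive/MyDrive/h5saves/processed_data_4000.h5
#   ...
# where the filename index is the position in file_label_pairs just past the
# saved chunk; an interrupted run can be resumed by setting start_by to the
# largest such index already on disk.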