"""Build a pairwise text/image dataset from a JSON array of card records.

Reads ``all-cards-clean.json`` (a top-level JSON array), keeps every record
that has an ``img`` value, renders a short text description for it, and then
pairs each kept record with 100 records drawn at random (with replacement)
from the same list. The resulting pairs are written to ``dataset.json``.
"""
import json
import random

import ijson
from tqdm import tqdm

INPUT_PATH = 'all-cards-clean.json'
OUTPUT_PATH = 'dataset.json'
PAIRS_PER_SAMPLE = 100
RANDOM_SEED = 420


def _describe(item):
    """Return the multi-line text description for one card record.

    The ``Cost`` and ``Colors`` lines are emitted only when the corresponding
    value is truthy; ``Title``, ``Type`` and ``Desc`` are always emitted (and
    render as ``None`` when the field is absent).
    """
    # A record without a ``details`` object is treated like one whose detail
    # fields are all missing, instead of raising AttributeError on ``None``.
    details = item.get('details') or {}

    lines = [f'Title: {item.get("name")}']
    mana_cost = details.get('mana_cost')
    if mana_cost:
        lines.append(f'Cost: {mana_cost}')
    colors = details.get('colors')
    if colors:
        lines.append(f'Colors: {colors}')
    lines.append(f'Type: {details.get("type_line")}')
    lines.append(f'Desc: {details.get("oracle_text")}')
    return '\n'.join(lines)


def _pair(left, right):
    """Combine two samples into one dataset row."""
    return {
        'uuid': f'{left.get("uid")}_{right.get("uid")}',
        'sentence_1': left.get('sentence'),
        'sentence_2': right.get('sentence'),
        'image_1': left.get('image'),
        'image_2': right.get('image'),
    }


# Stream the array one element at a time so the whole input never has to be
# held in memory. ijson expects a binary file object; the context manager
# guarantees the handle is closed even if parsing fails part-way through.
with open(INPUT_PATH, 'rb') as source:
    samples = [
        {
            'uid': item.get('id'),
            'sentence': _describe(item),
            'image': item.get('img'),
        }
        for item in ijson.items(source, 'item')
        if item.get('img') is not None  # records without an image are skipped
    ]

# Fixed seed keeps the generated pairing reproducible between runs.
random.seed(RANDOM_SEED)

# ``random.choices`` samples with replacement, so a record may be paired with
# itself or with the same partner more than once.
data = [
    _pair(elem_l, elem_r)
    for elem_l in samples
    for elem_r in random.choices(samples, k=PAIRS_PER_SAMPLE)
]

with open(OUTPUT_PATH, 'w') as f:
    json.dump(data, f)

print(f'Generated {len(data)} samples from {len(samples)} elements.')