card-labeler / upload_hf.py
Philipp Zettl
init commit
25e2635
raw
history blame contribute delete
1.35 kB
import json
import ijson
import random
from tqdm import tqdm
# Collect one record per card that has an image: its id, a multi-line text
# description and the image reference.
samples = []
# ijson consumes bytes, so the file is opened in binary mode; this also avoids
# decoding the input with the platform's default text encoding. The `with`
# block closes the file even if parsing or a lookup below raises.
with open('all-cards-clean.json', 'rb') as file:
    # 'item' selects each element of the top-level JSON array, one at a time,
    # so the whole array is never held in memory at once.
    for item in ijson.items(file, 'item'):
        if item.get('img') is None:
            # Cards without an image are left out of the samples.
            continue
        # Looked up once per card. An item that has no "details" object still
        # raises on the next line, exactly as the repeated lookups did.
        details = item.get('details')
        mana_cost = details.get('mana_cost')
        colors = details.get('colors')
        # The description is built line by line; the "Cost" and "Colors"
        # lines are emitted only when the value is present and non-empty.
        lines = [f'Title: {item.get("name")}']
        if mana_cost:
            lines.append(f'Cost: {mana_cost}')
        if colors:
            lines.append(f'Colors: {colors}')
        lines.append(f'Type: {details.get("type_line")}')
        lines.append(f'Desc: {details.get("oracle_text")}')
        samples.append({
            'uid': item.get('id'),
            'sentence': '\n'.join(lines),
            'image': item.get('img'),
        })
# Fixed seed so the generated pairing is reproducible between runs.
random.seed(420)
# Number of randomly drawn partners for every sample.
PAIRS_PER_SAMPLE = 100
# Each sample (left) is paired with PAIRS_PER_SAMPLE partners (right) drawn
# from the full sample list. random.choices draws with replacement, so a
# partner may repeat or be the left sample itself.
data = [
    {
        'uuid': f'{left.get("uid")}_{right.get("uid")}',
        'sentence_1': left.get('sentence'),
        'sentence_2': right.get('sentence'),
        'image_1': left.get('image'),
        'image_2': right.get('image'),
    }
    for left in samples
    for right in random.choices(samples, k=PAIRS_PER_SAMPLE)
]
# Serialize all generated pairs as a single JSON array in dataset.json.
with open('dataset.json', 'w') as out_file:
    json.dump(data, out_file)

# Report how many pairs were produced from how many source samples.
pair_count = len(data)
sample_count = len(samples)
print(f'Generated {pair_count} samples from {sample_count} elements.')