# Page header (extraction residue): Spaces: Sleeping | File size: 983 Bytes | 6fc683c
import json
import os
import subprocess
import traceback
from multiprocessing import Pool

from tqdm import tqdm
# Directory containing the laion-ocr zip archives (`<idx>.zip`); replace the
# 'XXX' placeholder before running.
ROOT_FROM = 'XXX' # the path of laion-ocr-zip
# Directory the archives are extracted into (created if missing); replace the
# 'XXX' placeholder before running.
ROOT_TO = 'XXX' # the path for saving dataset
# Number of worker processes used by the extraction pool.
MULTIPROCESSING_NUM = 64
# NOTE(review): this flag is not read anywhere in the visible code — confirm
# whether it is still used elsewhere.
DOWNLOAD_IMAGES = False # whether to download images from urls
def unzip_file(idx):
    """Extract ``{ROOT_FROM}/{idx}.zip`` into ``ROOT_TO`` with ``unzip -q``.

    The call is a no-op when the archive does not exist or when
    ``{ROOT_TO}/{idx}`` already exists (this assumes each archive unpacks
    into a top-level entry named ``idx``, so an existing entry means the
    archive was handled before).

    Failures of the external ``unzip`` program are reported on stdout but
    never raised, so one bad archive does not abort the other workers.

    Args:
        idx: Archive name without the ``.zip`` suffix.

    Returns:
        None.
    """
    archive = os.path.join(ROOT_FROM, f'{idx}.zip')
    target = os.path.join(ROOT_TO, f'{idx}')
    if not os.path.exists(archive) or os.path.exists(target):
        return

    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters reach unzip unchanged.
    try:
        result = subprocess.run(
            ['unzip', '-q', archive, '-d', ROOT_TO],
            check=False,
        )
    except OSError as err:
        # Typically the unzip executable is not installed or not on PATH.
        print(f'unzip_file: could not run unzip for {archive}: {err}')
        return

    if result.returncode != 0:
        print(
            f'unzip_file: unzip exited with status {result.returncode} '
            f'for {archive}'
        )
def multiprocess_unzip_file(idxs):
    """Extract all archives named in ``idxs`` with a pool of worker processes.

    ``ROOT_TO`` is created if it does not exist. Each entry of ``idxs`` is
    handed to ``unzip_file``; a tqdm bar advances once per finished entry.
    Results arrive in completion order, which is fine because only the
    count matters.

    Args:
        idxs: Sized collection of archive names without the ``.zip`` suffix.

    Returns:
        None.
    """
    os.makedirs(ROOT_TO, exist_ok=True)
    if idxs:
        # Never start more workers than there are archives to extract.
        workers = min(MULTIPROCESSING_NUM, len(idxs))
        with Pool(processes=workers) as pool, \
                tqdm(total=len(idxs), desc='total') as pbar:
            for _ in pool.imap_unordered(unzip_file, idxs):
                pbar.update()
    print("multiprocess_unzip_file done!")
if __name__ == '__main__':
    # Script entry point: collect the archive names found in ROOT_FROM and
    # extract them all into ROOT_TO.
    suffix = '.zip'
    # Only consider *.zip entries; other files would otherwise yield bogus
    # names. The set removes duplicates that zero-padding can create, so the
    # same archive is never handed to two workers; sorting keeps the order
    # reproducible.
    idxs = sorted({
        name[:-len(suffix)].zfill(5)
        for name in os.listdir(ROOT_FROM)
        if name.endswith(suffix)
    })
    multiprocess_unzip_file(idxs)
    print("Finished!")
|