File size: 983 Bytes
6fc683c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
import os
import traceback
from tqdm import tqdm
from multiprocessing import Pool

# Script configuration. The 'XXX' placeholders must be replaced with real paths.
ROOT_FROM = 'XXX' # path of laion-ocr-zip: the directory that holds the `<idx>.zip` archives
ROOT_TO = 'XXX' # path for saving the dataset: archives are extracted into this directory
# Number of worker processes in the unzip pool.
MULTIPROCESSING_NUM = 64
DOWNLOAD_IMAGES = False  # whether to download images from urls (not referenced in the code visible here)

def unzip_file(idx):
    """Extract ``{ROOT_FROM}/{idx}.zip`` into ``ROOT_TO`` with the ``unzip`` tool.

    The call is a no-op when the archive does not exist, or when
    ``{ROOT_TO}/{idx}`` already exists (presumably the directory a previous
    extraction of this archive produced -- confirm against the archive
    layout).

    Extraction is best-effort: a failure is reported on stdout but never
    raised, so one bad archive does not abort the whole worker pool.

    Args:
        idx: Archive name without the ``.zip`` extension.

    Returns:
        None.
    """
    # Local import keeps this function self-contained; the file's import
    # block does not provide `subprocess`.
    import subprocess

    archive = f'{ROOT_FROM}/{idx}.zip'
    if not os.path.exists(archive) or os.path.exists(f'{ROOT_TO}/{idx}'):
        return

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact and prevents them from being interpreted.
    try:
        result = subprocess.run(['unzip', '-q', archive, '-d', ROOT_TO])
    except OSError as err:
        # E.g. the `unzip` executable is not installed.
        print(f'unzip could not be started for {archive}: {err}')
        return
    if result.returncode != 0:
        print(f'unzip exited with status {result.returncode} for {archive}')


def multiprocess_unzip_file(idxs):
    """Run ``unzip_file`` over every entry of ``idxs`` in a process pool.

    ``ROOT_TO`` is created first if it is missing. Work is distributed over
    ``MULTIPROCESSING_NUM`` worker processes and a progress bar advances once
    per completed entry; results are consumed in completion order.

    Args:
        idxs: Sized collection of archive indices handed to ``unzip_file``.
    """
    os.makedirs(ROOT_TO, exist_ok=True)

    with Pool(processes=MULTIPROCESSING_NUM) as workers, \
            tqdm(total=len(idxs), desc='total') as progress:
        # The results themselves are unused; iterating only drives the bar.
        for _ in workers.imap_unordered(unzip_file, idxs):
            progress.update()
    print("multiprocess_unzip_file done!")

if __name__ == '__main__':
    # Only `.zip` entries are archives. Stripping four characters from every
    # directory entry would turn unrelated files into bogus indices, and a
    # sibling such as `00001.json` would duplicate the index of `00001.zip`,
    # letting two workers extract the same archive at once.
    files = os.listdir(ROOT_FROM)
    # The set removes duplicates; names are left-padded with zeros to five
    # characters, as before. Sorting just gives a reproducible task order.
    idxs = sorted({name[:-4].zfill(5) for name in files if name.endswith('.zip')})
    multiprocess_unzip_file(idxs)
    print("Finished!")