File size: 2,338 Bytes
82bc972
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# find split, spk, books in libriheavy_cuts_dev.jsonl, libriheavy_cuts_test_clean.jsonl, libriheavy_cuts_test_other.jsonl
# those would be in "id" field

import sys
import os, random, numpy as np, socket
import json
import tqdm
def write_jsonl(data, fn):
    with open(fn, "w") as file:
        for entry in data:
            file.write(json.dumps(entry, ensure_ascii=False) + "\n")
def read_jsonl(file_path):
    cur_data = []
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            cur_data.append(json.loads(line.strip()))
    return cur_data
import os
dataroot=os.environ["DATAROOT"]
manifestroot=os.path.join(dataroot, "libriheavy")
tgt_names = ['libriheavy_cuts_dev.jsonl', 'libriheavy_cuts_test_clean.jsonl', 'libriheavy_cuts_test_other.jsonl']
orig_names = ['libriheavy_long_original_cuts_small.jsonl', 'libriheavy_long_original_cuts_medium.jsonl', 'libriheavy_long_original_cuts_large.jsonl']

id2split = {}
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_dev.jsonl"))
dev_ids = set(["/".join(item['id'].split("/")[:3]) for item in data])
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_test_clean.jsonl"))
test_clean_ids = set(["/".join(item['id'].split("/")[:3]) for item in data])
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_test_other.jsonl"))
test_other_ids = set(["/".join(item['id'].split("/")[:3]) for item in data])

long_dev = []
long_test_clean = []
long_test_other = []
for orig_name in orig_names:
    keep = []
    data = read_jsonl(os.path.join(manifestroot, orig_name))
    for item in tqdm.tqdm(data):
        if "/".join(item['id'].split("/")[:3]) in dev_ids:
            long_dev.append(item)
        elif "/".join(item['id'].split("/")[:3]) in test_clean_ids:
            long_test_clean.append(item)
        elif "/".join(item['id'].split("/")[:3]) in test_other_ids:
            long_test_other.append(item)
        else:
            keep.append(item)
    write_jsonl(keep, os.path.join(manifestroot, orig_name.replace("_original", "")))

write_jsonl(long_dev, os.path.join(manifestroot, "libriheavy_long_cuts_dev.jsonl"))
write_jsonl(long_test_clean, os.path.join(manifestroot, "libriheavy_long_cuts_test_clean.jsonl"))
write_jsonl(long_test_other, os.path.join(manifestroot, "libriheavy_long_cuts_test_other.jsonl"))