Spaces:
Running
on
Zero
Running
on
Zero
# find split, spk, books in libriheavy_cuts_dev.jsonl, libriheavy_cuts_test_clean.jsonl, libriheavy_cuts_test_other.jsonl | |
# those would be in "id" field | |
import sys | |
import os, random, numpy as np, socket | |
import json | |
import tqdm | |
def write_jsonl(data, fn): | |
with open(fn, "w") as file: | |
for entry in data: | |
file.write(json.dumps(entry, ensure_ascii=False) + "\n") | |
def read_jsonl(file_path): | |
cur_data = [] | |
with open(file_path, 'r', encoding='utf-8-sig') as file: | |
for line in file: | |
cur_data.append(json.loads(line.strip())) | |
return cur_data | |
import os | |
dataroot=os.environ["DATAROOT"] | |
manifestroot=os.path.join(dataroot, "libriheavy") | |
tgt_names = ['libriheavy_cuts_dev.jsonl', 'libriheavy_cuts_test_clean.jsonl', 'libriheavy_cuts_test_other.jsonl'] | |
orig_names = ['libriheavy_long_original_cuts_small.jsonl', 'libriheavy_long_original_cuts_medium.jsonl', 'libriheavy_long_original_cuts_large.jsonl'] | |
id2split = {} | |
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_dev.jsonl")) | |
dev_ids = set(["/".join(item['id'].split("/")[:3]) for item in data]) | |
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_test_clean.jsonl")) | |
test_clean_ids = set(["/".join(item['id'].split("/")[:3]) for item in data]) | |
data = read_jsonl(os.path.join(manifestroot, "libriheavy_cuts_test_other.jsonl")) | |
test_other_ids = set(["/".join(item['id'].split("/")[:3]) for item in data]) | |
long_dev = [] | |
long_test_clean = [] | |
long_test_other = [] | |
for orig_name in orig_names: | |
keep = [] | |
data = read_jsonl(os.path.join(manifestroot, orig_name)) | |
for item in tqdm.tqdm(data): | |
if "/".join(item['id'].split("/")[:3]) in dev_ids: | |
long_dev.append(item) | |
elif "/".join(item['id'].split("/")[:3]) in test_clean_ids: | |
long_test_clean.append(item) | |
elif "/".join(item['id'].split("/")[:3]) in test_other_ids: | |
long_test_other.append(item) | |
else: | |
keep.append(item) | |
write_jsonl(keep, os.path.join(manifestroot, orig_name.replace("_original", ""))) | |
write_jsonl(long_dev, os.path.join(manifestroot, "libriheavy_long_cuts_dev.jsonl")) | |
write_jsonl(long_test_clean, os.path.join(manifestroot, "libriheavy_long_cuts_test_clean.jsonl")) | |
write_jsonl(long_test_other, os.path.join(manifestroot, "libriheavy_long_cuts_test_other.jsonl")) |