# VoiceStar/data/ll60k_preprocessing/step2_resplit_long.py
# mrfakename's picture
# Upload 51 files
# 82bc972 verified
# Find the split, speaker and book identifiers that occur in libriheavy_cuts_dev.jsonl,
# libriheavy_cuts_test_clean.jsonl and libriheavy_cuts_test_other.jsonl.
# These are taken from the "id" field of each entry.
import sys
import os, random, numpy as np, socket
import json
import tqdm
def write_jsonl(data, fn):
    """Write ``data`` to ``fn`` in JSON Lines format (one JSON document per line).

    Args:
        data: Iterable of JSON-serializable entries.
        fn: Path of the output file; it is opened in "w" mode, so an existing
            file is overwritten.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default.
    with open(fn, "w", encoding="utf-8") as file:
        for entry in data:
            file.write(json.dumps(entry, ensure_ascii=False) + "\n")
def read_jsonl(file_path):
    """Read a JSON Lines file and return its entries as a list.

    The file is decoded as "utf-8-sig", so a leading byte-order mark is
    dropped. Lines that are empty after stripping whitespace are skipped
    instead of being handed to the JSON parser.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-empty line, in file order.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    cur_data = []
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank line (e.g. stray trailing newline): nothing to parse.
                continue
            cur_data.append(json.loads(line))
    return cur_data
import os
# Re-split the "long" libriheavy manifests: every long cut whose id prefix
# also occurs in the dev / test_clean / test_other manifests is moved into a
# matching long dev/test manifest; all other cuts stay in the training files.

# DATAROOT must be set in the environment (KeyError otherwise).
dataroot = os.environ["DATAROOT"]
manifestroot = os.path.join(dataroot, "libriheavy")

# Manifests that define the held-out splits, in the order dev, test_clean, test_other.
tgt_names = ['libriheavy_cuts_dev.jsonl', 'libriheavy_cuts_test_clean.jsonl', 'libriheavy_cuts_test_other.jsonl']
# Long-cut manifests to be filtered.
orig_names = ['libriheavy_long_original_cuts_small.jsonl', 'libriheavy_long_original_cuts_medium.jsonl', 'libriheavy_long_original_cuts_large.jsonl']
id2split = {}  # not used below; kept so the module's globals are unchanged


def _group_key(cut_id):
    """Return the first three "/"-separated fields of ``cut_id``, re-joined with "/"."""
    return "/".join(cut_id.split("/")[:3])


def _load_group_keys(manifest_name):
    """Return the set of group keys of all entries in ``manifest_name`` under ``manifestroot``."""
    entries = read_jsonl(os.path.join(manifestroot, manifest_name))
    return {_group_key(entry['id']) for entry in entries}


dev_name, test_clean_name, test_other_name = tgt_names
dev_ids = _load_group_keys(dev_name)
test_clean_ids = _load_group_keys(test_clean_name)
test_other_ids = _load_group_keys(test_other_name)

# These accumulate across all long manifests and are written once at the end.
long_dev = []
long_test_clean = []
long_test_other = []

for orig_name in orig_names:
    keep = []  # cuts of this manifest that belong to none of the held-out splits
    data = read_jsonl(os.path.join(manifestroot, orig_name))
    for item in tqdm.tqdm(data):
        # Compute the key once per item; the first matching split wins
        # (dev, then test_clean, then test_other).
        key = _group_key(item['id'])
        if key in dev_ids:
            long_dev.append(item)
        elif key in test_clean_ids:
            long_test_clean.append(item)
        elif key in test_other_ids:
            long_test_other.append(item)
        else:
            keep.append(item)
    # The filtered manifest is written next to the input, with "_original"
    # removed from the file name.
    write_jsonl(keep, os.path.join(manifestroot, orig_name.replace("_original", "")))

write_jsonl(long_dev, os.path.join(manifestroot, "libriheavy_long_cuts_dev.jsonl"))
write_jsonl(long_test_clean, os.path.join(manifestroot, "libriheavy_long_cuts_test_clean.jsonl"))
write_jsonl(long_test_other, os.path.join(manifestroot, "libriheavy_long_cuts_test_other.jsonl"))