import json
import os
from concurrent.futures import ThreadPoolExecutor

import fire
from gtts import gTTS
from tqdm import tqdm


def update_vocab(vocab_path="data/vocab.json"):
    """Run the full pipeline: reformat the vocab JSON, generate TTS audio, and export plain text."""
    compact_json(vocab_path)
    generate_tts(vocab_path)
    conv_to_text(vocab_path)


def compact_json(
    src_path="data/vocab.json",
    dst_path=None,
    group_size=10,
    ensure_ascii=False,
    indent=2,
):
    """Flatten the grouped vocab list, regroup it into chunks of `group_size`, and write it
    back using CompactEncoder so short entries stay on a single line."""
    dst_path = dst_path or src_path
    with open(src_path, "rt", encoding="UTF-8") as fp:
        data = json.load(fp)
    # Flatten all groups, then re-split into fixed-size groups.
    data = [item for group in data for item in group]
    data = [data[i : i + group_size] for i in range(0, len(data), group_size)]
    with open(dst_path, "wt", encoding="UTF-8") as fp:
        json.dump(data, fp, cls=CompactEncoder, ensure_ascii=ensure_ascii, indent=indent)
    print(f"output: {dst_path}")


class CompactEncoder(json.JSONEncoder):
    """JSON encoder that keeps short lists and objects on a single line."""

    CONTAINER_TYPES = (list, tuple, dict)
    MAX_WIDTH = 100
    MAX_ITEMS = 10

    def __init__(self, *args, **kwargs):
        if kwargs.get("indent") is None:
            kwargs["indent"] = 4
        super().__init__(*args, **kwargs)
        self.indentation_level = 0

    def encode(self, o):
        if isinstance(o, (list, tuple)):
            return self._encode_list(o)
        if isinstance(o, dict):
            return self._encode_object(o)
        if isinstance(o, float):
            return format(o, "g")
        # Fall back to the standard encoder for the remaining primitives.
        return json.dumps(
            o,
            skipkeys=self.skipkeys,
            ensure_ascii=self.ensure_ascii,
            check_circular=self.check_circular,
            allow_nan=self.allow_nan,
            sort_keys=self.sort_keys,
            indent=self.indent,
            separators=(self.item_separator, self.key_separator),
            default=self.default if hasattr(self, "default") else None,
        )

    def _encode_list(self, o):
        if self._single_line(o):
            return "[" + ", ".join(self.encode(el) for el in o) + "]"
        self.indentation_level += 1
        output = [self.indent_str + self.encode(el) for el in o]
        self.indentation_level -= 1
        return "[\n" + ",\n".join(output) + "\n" + self.indent_str + "]"

    def _encode_object(self, o):
        if not o:
            return "{}"
        # JSON keys must be strings; a None key becomes "null".
        o = {str(k) if k is not None else "null": v for k, v in o.items()}
        if self.sort_keys:
            o = dict(sorted(o.items(), key=lambda x: x[0]))
        if self._single_line(o):
            return "{" + ", ".join(self._create_kv(k, v) for k, v in o.items()) + "}"
        self.indentation_level += 1
        output = [f"{self.indent_str}{self._create_kv(k, v)}" for k, v in o.items()]
        self.indentation_level -= 1
        return "{\n" + ",\n".join(output) + "\n" + self.indent_str + "}"

    def _create_kv(self, k, v):
        return f"{json.dumps(k)}: {self.encode(v)}"

    def iterencode(self, o, **_):
        # json.dump() calls iterencode(); route it through the custom encode().
        return self.encode(o)

    def _single_line(self, o):
        return (
            self._primitives_only(o)
            and len(o) <= self.MAX_ITEMS
            and len(str(o)) - 2 <= self.MAX_WIDTH
        )

    def _primitives_only(self, o: list | tuple | dict):
        if isinstance(o, (list, tuple)):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o)
        elif isinstance(o, dict):
            return not any(isinstance(el, self.CONTAINER_TYPES) for el in o.values())

    @property
    def indent_str(self) -> str:
        if isinstance(self.indent, int):
            return " " * (self.indentation_level * self.indent)
        elif isinstance(self.indent, str):
            return self.indentation_level * self.indent
        else:
            raise ValueError(f"indent must either be of type int or str (is: {type(self.indent)})")


def generate_tts(src_path="data/vocab.json", output_dir="data/tts"):
    """Generate a Japanese TTS mp3 for every kana entry, skipping files that already exist."""
    os.makedirs(output_dir, exist_ok=True)
    data = load_json(src_path)
    text_list = [item["kana"] for item_list in data for item in item_list]

    def generate_tts_worker(text):
        fp = os.path.join(output_dir, f"{text}.mp3")
        if os.path.exists(fp):
            return
        gTTS(text=text, lang="ja").save(fp)

    with tqdm(total=len(text_list), desc="generating tts") as pbar:
        with ThreadPoolExecutor() as executor:
            for _ in executor.map(generate_tts_worker, text_list):
                pbar.update(1)


def conv_to_text(vocab_path, dst_path="data/vocab.txt"):
    """Export the vocab as plain text: one "kana kanji meaning" line per entry, with a blank
    line between groups."""
    vocab_list = load_json(vocab_path)
    lines = []
    for group in vocab_list:
        for v in group:
            t = [t for t in (v["kana"], v["kanji"], v["meaning"]) if t]
            lines.append(" ".join(t))
        lines.append("")
    with open(dst_path, "wt", encoding="UTF-8") as fp:
        fp.write("\n".join(lines))


def load_json(path):
    with open(path, "rt", encoding="UTF-8") as fp:
        return json.load(fp)


if __name__ == "__main__":
    # CLI subcommands exposed via python-fire: `update`, `compact`, and `tts`.
    fire_map = dict(update=update_vocab, compact=compact_json, tts=generate_tts)
    fire.Fire(fire_map)
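
# Example invocations (a sketch; the script's filename is not given in the source, so
# "vocab.py" below is assumed). python-fire maps keyword arguments to --flags:
#
#   python vocab.py update
#   python vocab.py compact --group_size=20
#   python vocab.py tts --output_dir=data/tts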