# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import re
import typing as tp
from pathlib import Path

from botok.tokenizers import sentencetokenizer as bod_sent_tok

# Indic NLP
from indicnlp import common as indic_common
from indicnlp import loader as indic_loader
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import sentence_tokenize as indic_sent_tok
from khmernltk import sentence_tokenize as khm_sent_tok

# pythainlp for Thai
# Seahorse for Indonesian, Thai, Vietnamese
# botok for Tibetan
# Spacy for various tool-kits
from laonlp.tokenize import sent_tokenize as lao_sent_tok

# --- sentence splitters
# Moses-style
from sentence_splitter import SentenceSplitter

INDIC_NLP_RESOURCES = None  # apparently not needed for splitting and normalization

logger = logging.getLogger("sentence_split")

# Map NLLB-style codes (ISO 639-3 plus script) to the language codes expected
# by the splitters below.
split_lang_code_map = {
    "ace_Arab": "ace_Arab",
    "ace_Latn": "ace_Latn",
    "acm_Arab": "acm",
    "acq_Arab": "acq",
    "aeb_Arab": "aeb",
    "afr_Latn": "afr",
    "ajp_Arab": "ajp",
    "aka_Latn": "aka",
    "amh_Ethi": "amh",
    "apc_Arab": "apc",
    "arb_Arab": "ara_Arab",
    "arb_Latn": "ara_Latn",
    "ars_Arab": "ars",
    "ary_Arab": "ary",
    "arz_Arab": "arz",
    "asm_Beng": "asm",
    "ast_Latn": "ast",
    "awa_Deva": "awa",
    "ayr_Latn": "ayr",
    "azb_Arab": "azb",
    "azj_Latn": "azj",
    "bak_Cyrl": "bak",
    "bam_Latn": "bam",
    "ban_Latn": "ban",
    "bel_Cyrl": "bel",
    "bem_Latn": "bem",
    "ben_Beng": "ben",
    "bho_Deva": "bho",
    "bjn_Arab": "bjn_Arab",
    "bjn_Latn": "bjn_Latn",
    "bod_Tibt": "bod",
    "bos_Latn": "bos",
    "bug_Latn": "bug",
    "bul_Cyrl": "bul",
    "cat_Latn": "cat",
    "ceb_Latn": "ceb",
    "ces_Latn": "ces",
    "cjk_Latn": "cjk",
    "ckb_Arab": "ckb",
    "crh_Latn": "crh_Latn",
    "cym_Latn": "cym",
    "dan_Latn": "dan",
    "deu_Latn": "deu",
    "dik_Latn": "dik",
    "diq_Latn": "diq",
    "dyu_Latn": "dyu",
    "dzo_Tibt": "dzo",
    "ell_Grek": "ell",
    "eng_Latn": "eng",
    "epo_Latn": "epo",
    "est_Latn": "est",
    "eus_Latn": "eus",
    "ewe_Latn": "ewe",
    "fao_Latn": "fao",
    "pes_Arab": "fas",
    "fij_Latn": "fij",
    "fin_Latn": "fin",
    "fon_Latn": "fon",
    "fra_Latn": "fra",
    "fur_Latn": "fur",
    "fuv_Latn": "fuv",
    "gla_Latn": "gla",
    "gle_Latn": "gle",
    "glg_Latn": "glg",
    "grn_Latn": "grn",
    "guj_Gujr": "guj",
    "hat_Latn": "hat",
    "hau_Latn": "hau",
    "heb_Hebr": "heb",
    "hin_Deva": "hin",
    "hne_Deva": "hne",
    "hrv_Latn": "hrv",
    "hun_Latn": "hun",
    "hye_Armn": "hye",
    "ibo_Latn": "ibo",
    "ilo_Latn": "ilo",
    "ind_Latn": "ind",
    "isl_Latn": "isl",
    "ita_Latn": "ita",
    "jav_Latn": "jav",
    "jpn_Jpan": "jpn",
    "kab_Latn": "kab",
    "kac_Latn": "kac",
    "kam_Latn": "kam",
    "kan_Knda": "kan",
    "kas_Arab": "kas_Arab",
    "kas_Deva": "kas_Deva",
    "kat_Geor": "kat",
    "knc_Arab": "kau_Arab",
    "knc_Latn": "kau_Latn",
    "kaz_Cyrl": "kaz",
    "kbp_Latn": "kbp",
    "kea_Latn": "kea",
    "khm_Khmr": "khm",
    "kik_Latn": "kik",
    "kin_Latn": "kin",
    "kir_Cyrl": "kir",
    "kmb_Latn": "kmb",
    "kon_Latn": "kon",
    "kor_Hang": "kor",
    "kmr_Latn": "kur",
    "lao_Laoo": "lao",
    "lvs_Latn": "lav",
    "lij_Latn": "lij",
    "lim_Latn": "lim",
    "lin_Latn": "lin",
    "lit_Latn": "lit",
    "lmo_Latn": "lmo",
    "ltg_Latn": "ltg",
    "ltz_Latn": "ltz",
    "lua_Latn": "lua",
    "lug_Latn": "lug",
    "luo_Latn": "luo",
    "lus_Latn": "lus",
    "mag_Deva": "mag",
    "mai_Deva": "mai",
    "mal_Mlym": "mal",
    "mar_Deva": "mar",
    "min_Arab": "min_Arab",
    "min_Latn": "min_Latn",
    "mkd_Cyrl": "mkd",
    "plt_Latn": "mlg",
    "mlt_Latn": "mlt",
    "khk_Cyrl": "mon",
    "mos_Latn": "mos",
    "mri_Latn": "mri",
    "zsm_Latn": "msa",
    "mya_Mymr": "mya",
    "nld_Latn": "nld",
    "nno_Latn": "nno",
    "nob_Latn": "nob",
    "npi_Deva": "npi",
    "nso_Latn": "nso",
    "nus_Latn": "nus",
    "nya_Latn": "nya",
    "oci_Latn": "oci",
    "gaz_Latn": "orm",
    "ory_Orya": "ory",
    "pag_Latn": "pag",
    "pan_Guru": "pan",
    "pap_Latn": "pap",
    "pol_Latn": "pol",
    "por_Latn": "por",
    "prs_Arab": "prs",
    "pbt_Arab": "pus",
    "quy_Latn": "que",
    "ron_Latn": "ron",
    "run_Latn": "run",
    "rus_Cyrl": "rus",
    "sag_Latn": "sag",
    "san_Deva": "san",
    "sat_Olck": "sat",
    "scn_Latn": "scn",
    "shn_Mymr": "shn",
    "sin_Sinh": "sin",
    "slk_Latn": "slk",
    "slv_Latn": "slv",
    "smo_Latn": "smo",
    "sna_Latn": "sna",
    "snd_Arab": "snd",
    "som_Latn": "som",
    "sot_Latn": "sot",
    "spa_Latn": "spa",
    "als_Latn": "sqi",
    "srd_Latn": "srd",
    "srp_Cyrl": "srp_Cyrl",
    "ssw_Latn": "ssw",
    "sun_Latn": "sun",
    "swe_Latn": "swe",
    "swh_Latn": "swh",
    "szl_Latn": "szl",
    "tam_Taml": "tam",
    "tat_Cyrl": "tat_Cyrl",
    "tel_Telu": "tel",
    "tgk_Cyrl": "tgk",
    "tgl_Latn": "tgl",
    "tha_Thai": "tha",
    "tir_Ethi": "tir",
    "taq_Latn": "tmh_Latn",
    "taq_Tfng": "tmh_Tfng",
    "ton_Latn": "ton",
    "tpi_Latn": "tpi",
    "tsn_Latn": "tsn",
    "tso_Latn": "tso",
    "tuk_Latn": "tuk",
    "tum_Latn": "tum",
    "tur_Latn": "tur",
    "twi_Latn": "twi",
    "tzm_Tfng": "tzm",
    "uig_Arab": "uig",
    "ukr_Cyrl": "ukr",
    "umb_Latn": "umb",
    "urd_Arab": "urd",
    "uzn_Latn": "uzb",
    "vec_Latn": "vec",
    "vie_Latn": "vie",
    "war_Latn": "war",
    "wol_Latn": "wol",
    "xho_Latn": "xho",
    "ydd_Hebr": "yid",
    "yor_Latn": "yor",
    "yue_Hant": "yue",
    "zho_Hans": "zho_Hans",
    "zho_Hant": "zho_Hant",
    "zul_Latn": "zul",
}
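# Example (sketch): get_split_algo() below first normalizes its `lang` argument
# through this table before picking a splitter, e.g. "khm_Khmr" -> "khm"
# (Khmer rules) and "eng_Latn" -> "eng" (Moses rules).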
"mlg", "mlt_Latn" : "mlt", "khk_Cyrl" : "mon", "mos_Latn" : "mos", "mri_Latn" : "mri", "zsm_Latn" : "msa", "mya_Mymr" : "mya", "nld_Latn" : "nld", "nno_Latn" : "nno", "nob_Latn" : "nob", "npi_Deva" : "npi", "nso_Latn" : "nso", "nus_Latn" : "nus", "nya_Latn" : "nya", "oci_Latn" : "oci", "gaz_Latn" : "orm", "ory_Orya" : "ory", "pag_Latn" : "pag", "pan_Guru" : "pan", "pap_Latn" : "pap", "pol_Latn" : "pol", "por_Latn" : "por", "prs_Arab" : "prs", "pbt_Arab" : "pus", "quy_Latn" : "que", "ron_Latn" : "ron", "run_Latn" : "run", "rus_Cyrl" : "rus", "sag_Latn" : "sag", "san_Deva" : "san", "sat_Olck" : "sat", "scn_Latn" : "scn", "shn_Mymr" : "shn", "sin_Sinh" : "sin", "slk_Latn" : "slk", "slv_Latn" : "slv", "smo_Latn" : "smo", "sna_Latn" : "sna", "snd_Arab" : "snd", "som_Latn" : "som", "sot_Latn" : "sot", "spa_Latn" : "spa", "als_Latn" : "sqi", "srd_Latn" : "srd", "srp_Cyrl" : "srp_Cyrl", "ssw_Latn" : "ssw", "sun_Latn" : "sun", "swe_Latn" : "swe", "swh_Latn" : "swh", "szl_Latn" : "szl", "tam_Taml" : "tam", "tat_Cyrl" : "tat_Cyrl", "tel_Telu" : "tel", "tgk_Cyrl" : "tgk", "tgl_Latn" : "tgl", "tha_Thai" : "tha", "tir_Ethi" : "tir", "taq_Latn" : "tmh_Latn", "taq_Tfng" : "tmh_Tfng", "ton_Latn" : "ton", "tpi_Latn" : "tpi", "tsn_Latn" : "tsn", "tso_Latn" : "tso", "tuk_Latn" : "tuk", "tum_Latn" : "tum", "tur_Latn" : "tur", "twi_Latn" : "twi", "tzm_Tfng" : "tzm", "uig_Arab" : "uig", "ukr_Cyrl" : "ukr", "umb_Latn" : "umb", "urd_Arab" : "urd", "uzn_Latn" : "uzb", "vec_Latn" : "vec", "vie_Latn" : "vie", "war_Latn" : "war", "wol_Latn" : "wol", "xho_Latn" : "xho", "ydd_Hebr" : "yid", "yor_Latn" : "yor", "yue_Hant" : "yue", "zho_Hans" : "zho_Hans", "zho_Hant" : "zho_Hant", "zul_Latn" : "zul" } # ---------------------------------- # Supported tokenization algorithms # List of supported languages and mapping ISO3 - > ISO2 LANGS_MOSES = { "cat": "ca", "ces": "cs", "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr", "deu": "de", "ell": "el", "hun": "hu", "isl": "is", "ita": "it", "lav": "lv", "lit": "lt", "nob": "no", "pol": "pl", "por": "pt", "ron": "ro", "rus": "ru", "slk": "sk", "slv": "sl", "spa": "es", "swe": "sv", "tur": "tr", } LANGS_LAONLP = {"lao": "lao"} LANGS_KHMER = {"khm": "khm"} LANGS_BODNLP = { "bod": "bod", "dzo": "dzo", } # languages with tibetan script # ---------------------------------------------- LANGS_INDIC = { "asm": "as", "awa": "hi", "ben": "bn", "bho": "hi", "brx": "bD", "gom": "xx", "guj": "gu", "hin": "hi", "hne": "hi", "kan": "kn", "kas": "hi", "kas_Deva": "hi", "kok": "kK", "mni": "bn", # our meitei is in bengali script, so swapped it to bengali here "mag": "hi", "mai": "hi", "mal": "ml", "mar": "mr", "npi": "ne", "ory": "or", "pan": "pa", "san": "sa", "snd": "sd", "tam": "ta", "tel": "te", "urd": "ur", } # ---------------------------------------------- LANGS_GEEZ = {"amh": "amh", "tir": "tir"} def split_geez(line: str) -> tp.Iterable[str]: """Split Amharic text into sentences.""" line = line.replace("፡፡", "።") # remove "•" if there's already EOS marker before line = ( line.replace("። •", "።") .replace("? •", "?") .replace("! •", "!") .replace(". 
•", ".") ) for sent in re.findall(r"[^።•!?\!\?\.]+[።•!?।৷\?\!\.]?", line, flags=re.U): yield sent # ---------------------------------------------- LANGS_OLCHIKI = {"san": "san"} def split_olchiki(line: str) -> tp.Iterable[str]: """Split Santali text into sentences.""" for sent in re.findall(r"[^᱾|᱿!?\!\?]+[᱾|᱿!?\?\!]?", line, flags=re.U): yield sent # test sentence: ᱱᱤᱭᱟᱹ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱫᱚ ᱥᱟᱱᱛᱟᱲᱤ ᱛᱮ ᱚᱞ ᱟᱠᱟᱱᱟ᱾ ᱚᱨᱦᱚᱸ ᱮᱴᱟᱜ ᱯᱟᱹᱨᱥᱤᱛᱮ ᱦᱚᱸ ᱟᱭᱢᱟ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱢᱮᱱᱟᱜᱼᱟ ᱾ ᱱᱚᱸᱰᱮ ᱠᱤᱪᱷᱩ ᱛᱟᱹᱞᱠᱟᱹ ᱮᱢ ᱦᱩᱭᱱᱟ ᱾ # splits three times # ---------------------------------------------- LANGS_BURMESE = {"mya": "mya", "shn": "shn"} def split_burmese(line: str) -> tp.Iterable[str]: """Split Amharic text into sentences.""" # remove "•" if there's already EOS marker before line = line.replace("။”", "APOS။") for sent in re.findall(r"[^။!?\!\?\.]+[။!?।৷\?\!\.]?", line, flags=re.U): yield sent.replace("APOS။", "။”") # ---------------------------------- def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]: if lang in split_lang_code_map: lang = split_lang_code_map[lang] # get default algorithm if requested if split_algo == "default": # use best algorithm in function of language if lang in LANGS_MOSES: split_algo = "moses" elif lang in LANGS_INDIC: split_algo = "indic" elif lang in LANGS_GEEZ: split_algo = "geez" elif lang in LANGS_KHMER: split_algo = "khmer" elif lang in LANGS_BURMESE: split_algo = "burmese" else: # use Moses by default (which likely will fall-back to English) split_algo = "moses" logger.info(f" - default algorithm for {lang} is {split_algo}") if split_algo == "none" or lang == "TODO": logger.info(" - no sentence splitting") return lambda line: [line] elif split_algo == "moses": if lang in LANGS_MOSES: lang = LANGS_MOSES[lang] logger.info(f" - Moses sentence splitter: using rules for '{lang}'") else: lang = "en" logger.info( f" - Moses sentence splitter for {lang}: falling back to {lang} rules" ) splitter = SentenceSplitter(language=lang) # non_breaking_prefix_file=non_breaking_prefix_file return splitter.split elif split_algo == "indic": # initialize toolkit (apparently not needed for sentence segmentation) if INDIC_NLP_RESOURCES: logger.info(" - Initialize Indic NLP toolkit") indic_common.set_resources_path(INDIC_NLP_RESOURCES) indic_loader.load() if lang in LANGS_INDIC: lang = LANGS_INDIC[lang] logger.info(f" - Indic sentence splitter: using rules for '{lang}'") else: lang = "hi" logger.info( f" - Indic sentence splitter for {lang}: falling back to {lang} rules" ) # setup normalizer factory = IndicNormalizerFactory() indic_normalizer = factory.get_normalizer(lang) def split_indic(line: str) -> tp.Iterable[str]: """Split Indian text into sentences using Indic NLP tool.""" line = indic_normalizer.normalize(line) for sent in indic_sent_tok.sentence_split(line, lang=lang): yield sent return split_indic elif split_algo == "laonlp": logger.info(f" - LaoNLP sentence splitter applied to '{lang}'") return lao_sent_tok elif split_algo == "khmer": logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'") return khm_sent_tok elif split_algo == "bodnlp": logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'") return bod_sent_tok elif split_algo == "geez": logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'") return split_geez elif split_algo == "burmese": logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'") return split_burmese else: logger.error(f"Unknown splitting algorithm {split_algo}") return None