# Plan2Align-NV / laser/utils/src/sentence_split.py
# (scraped GitHub page header neutralized — author: KuangDW,
#  commit 05d3571 "Add laser2.spm using Git LFS")
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import logging
import re
import typing as tp
from pathlib import Path
from botok.tokenizers import sentencetokenizer as bod_sent_tok
# Indic NLP
from indicnlp import common as indic_common
from indicnlp import loader as indic_loader
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import sentence_tokenize as indic_sent_tok
from khmernltk import sentence_tokenize as khm_sent_tok
# Other candidate toolkits:
# - pythainlp for Thai
# - Seahorse for Indonesian, Thai, Vietnamese
# - botok for Tibetan
# - Spacy and various other tool-kits
from laonlp.tokenize import sent_tokenize as lao_sent_tok
# --- sentence splitters
# Moses-style
from sentence_splitter import SentenceSplitter
# Optional path to the Indic NLP resource files; left as None because the
# resources are apparently not needed for splitting and normalization.
INDIC_NLP_RESOURCES = None  # apparently not needed for splitting and normalization
# Module-level logger used by get_split_algo and its helpers.
logger = logging.getLogger("sentence_split")
# Map FLORES-200-style codes ("lll_Scrp") to the codes used by the splitter
# tables below. Fix: the original literal listed "arb_Arab" twice ("ara" then
# "ara_Arab"); the first entry was dead (Python keeps the last duplicate), so
# it is removed here while preserving the effective value "ara_Arab".
split_lang_code_map = {
    "ace_Arab": "ace_Arab",
    "ace_Latn": "ace_Latn",
    "acm_Arab": "acm",
    "acq_Arab": "acq",
    "aeb_Arab": "aeb",
    "afr_Latn": "afr",
    "ajp_Arab": "ajp",
    "aka_Latn": "aka",
    "amh_Ethi": "amh",
    "apc_Arab": "apc",
    "arb_Arab": "ara_Arab",
    "arb_Latn": "ara_Latn",
    "ars_Arab": "ars",
    "ary_Arab": "ary",
    "arz_Arab": "arz",
    "asm_Beng": "asm",
    "ast_Latn": "ast",
    "awa_Deva": "awa",
    "ayr_Latn": "ayr",
    "azb_Arab": "azb",
    "azj_Latn": "azj",
    "bak_Cyrl": "bak",
    "bam_Latn": "bam",
    "ban_Latn": "ban",
    "bel_Cyrl": "bel",
    "bem_Latn": "bem",
    "ben_Beng": "ben",
    "bho_Deva": "bho",
    "bjn_Arab": "bjn_Arab",
    "bjn_Latn": "bjn_Latn",
    "bod_Tibt": "bod",
    "bos_Latn": "bos",
    "bug_Latn": "bug",
    "bul_Cyrl": "bul",
    "cat_Latn": "cat",
    "ceb_Latn": "ceb",
    "ces_Latn": "ces",
    "cjk_Latn": "cjk",
    "ckb_Arab": "ckb",
    "crh_Latn": "crh_Latn",
    "cym_Latn": "cym",
    "dan_Latn": "dan",
    "deu_Latn": "deu",
    "dik_Latn": "dik",
    "diq_Latn": "diq",
    "dyu_Latn": "dyu",
    "dzo_Tibt": "dzo",
    "ell_Grek": "ell",
    "eng_Latn": "eng",
    "epo_Latn": "epo",
    "est_Latn": "est",
    "eus_Latn": "eus",
    "ewe_Latn": "ewe",
    "fao_Latn": "fao",
    "pes_Arab": "fas",
    "fij_Latn": "fij",
    "fin_Latn": "fin",
    "fon_Latn": "fon",
    "fra_Latn": "fra",
    "fur_Latn": "fur",
    "fuv_Latn": "fuv",
    "gla_Latn": "gla",
    "gle_Latn": "gle",
    "glg_Latn": "glg",
    "grn_Latn": "grn",
    "guj_Gujr": "guj",
    "hat_Latn": "hat",
    "hau_Latn": "hau",
    "heb_Hebr": "heb",
    "hin_Deva": "hin",
    "hne_Deva": "hne",
    "hrv_Latn": "hrv",
    "hun_Latn": "hun",
    "hye_Armn": "hye",
    "ibo_Latn": "ibo",
    "ilo_Latn": "ilo",
    "ind_Latn": "ind",
    "isl_Latn": "isl",
    "ita_Latn": "ita",
    "jav_Latn": "jav",
    "jpn_Jpan": "jpn",
    "kab_Latn": "kab",
    "kac_Latn": "kac",
    "kam_Latn": "kam",
    "kan_Knda": "kan",
    "kas_Arab": "kas_Arab",
    "kas_Deva": "kas_Deva",
    "kat_Geor": "kat",
    "knc_Arab": "kau_Arab",
    "knc_Latn": "kau_Latn",
    "kaz_Cyrl": "kaz",
    "kbp_Latn": "kbp",
    "kea_Latn": "kea",
    "khm_Khmr": "khm",
    "kik_Latn": "kik",
    "kin_Latn": "kin",
    "kir_Cyrl": "kir",
    "kmb_Latn": "kmb",
    "kon_Latn": "kon",
    "kor_Hang": "kor",
    "kmr_Latn": "kur",
    "lao_Laoo": "lao",
    "lvs_Latn": "lav",
    "lij_Latn": "lij",
    "lim_Latn": "lim",
    "lin_Latn": "lin",
    "lit_Latn": "lit",
    "lmo_Latn": "lmo",
    "ltg_Latn": "ltg",
    "ltz_Latn": "ltz",
    "lua_Latn": "lua",
    "lug_Latn": "lug",
    "luo_Latn": "luo",
    "lus_Latn": "lus",
    "mag_Deva": "mag",
    "mai_Deva": "mai",
    "mal_Mlym": "mal",
    "mar_Deva": "mar",
    "min_Arab": "min_Arab",
    "min_Latn": "min_Latn",
    "mkd_Cyrl": "mkd",
    "plt_Latn": "mlg",
    "mlt_Latn": "mlt",
    "khk_Cyrl": "mon",
    "mos_Latn": "mos",
    "mri_Latn": "mri",
    "zsm_Latn": "msa",
    "mya_Mymr": "mya",
    "nld_Latn": "nld",
    "nno_Latn": "nno",
    "nob_Latn": "nob",
    "npi_Deva": "npi",
    "nso_Latn": "nso",
    "nus_Latn": "nus",
    "nya_Latn": "nya",
    "oci_Latn": "oci",
    "gaz_Latn": "orm",
    "ory_Orya": "ory",
    "pag_Latn": "pag",
    "pan_Guru": "pan",
    "pap_Latn": "pap",
    "pol_Latn": "pol",
    "por_Latn": "por",
    "prs_Arab": "prs",
    "pbt_Arab": "pus",
    "quy_Latn": "que",
    "ron_Latn": "ron",
    "run_Latn": "run",
    "rus_Cyrl": "rus",
    "sag_Latn": "sag",
    "san_Deva": "san",
    "sat_Olck": "sat",
    "scn_Latn": "scn",
    "shn_Mymr": "shn",
    "sin_Sinh": "sin",
    "slk_Latn": "slk",
    "slv_Latn": "slv",
    "smo_Latn": "smo",
    "sna_Latn": "sna",
    "snd_Arab": "snd",
    "som_Latn": "som",
    "sot_Latn": "sot",
    "spa_Latn": "spa",
    "als_Latn": "sqi",
    "srd_Latn": "srd",
    "srp_Cyrl": "srp_Cyrl",
    "ssw_Latn": "ssw",
    "sun_Latn": "sun",
    "swe_Latn": "swe",
    "swh_Latn": "swh",
    "szl_Latn": "szl",
    "tam_Taml": "tam",
    "tat_Cyrl": "tat_Cyrl",
    "tel_Telu": "tel",
    "tgk_Cyrl": "tgk",
    "tgl_Latn": "tgl",
    "tha_Thai": "tha",
    "tir_Ethi": "tir",
    "taq_Latn": "tmh_Latn",
    "taq_Tfng": "tmh_Tfng",
    "ton_Latn": "ton",
    "tpi_Latn": "tpi",
    "tsn_Latn": "tsn",
    "tso_Latn": "tso",
    "tuk_Latn": "tuk",
    "tum_Latn": "tum",
    "tur_Latn": "tur",
    "twi_Latn": "twi",
    "tzm_Tfng": "tzm",
    "uig_Arab": "uig",
    "ukr_Cyrl": "ukr",
    "umb_Latn": "umb",
    "urd_Arab": "urd",
    "uzn_Latn": "uzb",
    "vec_Latn": "vec",
    "vie_Latn": "vie",
    "war_Latn": "war",
    "wol_Latn": "wol",
    "xho_Latn": "xho",
    "ydd_Hebr": "yid",
    "yor_Latn": "yor",
    "yue_Hant": "yue",
    "zho_Hans": "zho_Hans",
    "zho_Hant": "zho_Hant",
    "zul_Latn": "zul",
}
# ----------------------------------
# Supported tokenization algorithms
# Lists of supported languages and mapping ISO 639-3 -> ISO 639-1
# Languages supported by the Moses-style splitter (`sentence_splitter`
# package), mapping ISO 639-3 codes to the ISO 639-1 codes it expects.
LANGS_MOSES = {
    "cat": "ca",
    "ces": "cs",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "fin": "fi",
    "fra": "fr",
    "deu": "de",
    "ell": "el",
    "hun": "hu",
    "isl": "is",
    "ita": "it",
    "lav": "lv",
    "lit": "lt",
    "nob": "no",
    "pol": "pl",
    "por": "pt",
    "ron": "ro",
    "rus": "ru",
    "slk": "sk",
    "slv": "sl",
    "spa": "es",
    "swe": "sv",
    "tur": "tr",
}
# Lao: handled by LaoNLP's sent_tokenize (identity code mapping).
LANGS_LAONLP = {"lao": "lao"}
# Khmer: handled by khmer-nltk's sentence_tokenize.
LANGS_KHMER = {"khm": "khm"}
# Handled by botok's sentence tokenizer.
LANGS_BODNLP = {
    "bod": "bod",
    "dzo": "dzo",
}  # languages with tibetan script
# ----------------------------------------------
# Languages handled by the Indic NLP library's sentence splitter, mapping
# ISO 639-3 codes to the (mostly ISO 639-1) codes Indic NLP expects.
# Several Devanagari-script languages without dedicated rules are routed
# to "hi" (Hindi rules).
LANGS_INDIC = {
    "asm": "as",
    "awa": "hi",
    "ben": "bn",
    "bho": "hi",
    "brx": "bD",  # NOTE(review): "bD" is unusual — confirm Indic NLP's code for Bodo
    "gom": "xx",  # NOTE(review): "xx" looks like a placeholder — confirm code for Goan Konkani
    "guj": "gu",
    "hin": "hi",
    "hne": "hi",
    "kan": "kn",
    "kas": "hi",
    "kas_Deva": "hi",
    "kok": "kK",  # NOTE(review): "kK" is unusual — confirm Indic NLP's code for Konkani
    "mni": "bn",  # our meitei is in bengali script, so swapped it to bengali here
    "mag": "hi",
    "mai": "hi",
    "mal": "ml",
    "mar": "mr",
    "npi": "ne",
    "ory": "or",
    "pan": "pa",
    "san": "sa",
    "snd": "sd",
    "tam": "ta",
    "tel": "te",
    "urd": "ur",
}
# ----------------------------------------------
# Languages written in the Ge'ez (Ethiopic) script, split by rule.
LANGS_GEEZ = {"amh": "amh", "tir": "tir"}


def split_geez(line: str) -> tp.Iterable[str]:
    """Split Ge'ez-script text (Amharic/Tigrinya) into sentences by rule."""
    # Normalize the two-colon variant of the Ethiopic full stop.
    line = line.replace("፡፡", "።")
    # Remove a "•" bullet when an end-of-sentence marker already precedes it.
    for pair in ("። •", "? •", "! •", ". •"):
        line = line.replace(pair, pair[0])
    # Non-terminator runs, each optionally followed by one terminator.
    yield from re.findall(r"[^።•!?\!\?\.]+[።•!?।৷\?\!\.]?", line, flags=re.U)
# ----------------------------------------------
# Fix: Santali is ISO 639-3 "sat" (cf. "sat_Olck" above); the original
# "san" is Sanskrit, which is already handled by LANGS_INDIC.
LANGS_OLCHIKI = {"sat": "sat"}


def split_olchiki(line: str) -> tp.Iterable[str]:
    """Split Santali (Ol Chiki script) text into sentences by rule.

    Sentences end with ᱾ (mucaad), ᱿ (double mucaad), "?" or "!".
    NOTE(review): the ASCII "|" inside both character classes is a literal
    pipe, so "|" also acts as a terminator — confirm this is intended.
    """
    for sent in re.findall(r"[^᱾|᱿!?\!\?]+[᱾|᱿!?\?\!]?", line, flags=re.U):
        yield sent
# test sentence: ᱱᱤᱭᱟᱹ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱫᱚ ᱥᱟᱱᱛᱟᱲᱤ ᱛᱮ ᱚᱞ ᱟᱠᱟᱱᱟ᱾ ᱚᱨᱦᱚᱸ ᱮᱴᱟᱜ ᱯᱟᱹᱨᱥᱤᱛᱮ ᱦᱚᱸ ᱟᱭᱢᱟ ᱣᱤᱠᱤᱯᱤᱰᱤᱭᱟ ᱢᱮᱱᱟᱜᱼᱟ ᱾ ᱱᱚᱸᱰᱮ ᱠᱤᱪᱷᱩ ᱛᱟᱹᱞᱠᱟᱹ ᱮᱢ ᱦᱩᱭᱱᱟ ᱾
# splits three times
# ----------------------------------------------
# Languages written in the Burmese script, split by rule.
LANGS_BURMESE = {"mya": "mya", "shn": "shn"}


def split_burmese(line: str) -> tp.Iterable[str]:
    """Split Burmese-script text (Burmese, Shan) into sentences by rule.

    Fix: the docstring previously claimed to split Amharic (copy-paste
    from split_geez); the code only handles the Burmese section mark "။".
    """
    # Temporarily shield "။”" (section mark + closing quote) behind a
    # sentinel so the quote is not detached from its sentence; the
    # sentinel is undone on each yielded sentence.
    line = line.replace("။”", "APOS။")
    for sent in re.findall(r"[^။!?\!\?\.]+[။!?।৷\?\!\.]?", line, flags=re.U):
        yield sent.replace("APOS။", "။”")
# ----------------------------------
def get_split_algo(lang: str, split_algo: str) -> tp.Callable[[str], tp.Iterable[str]]:
    """Return a sentence-splitting callable for the given language.

    Args:
        lang: language code; FLORES-200-style codes (e.g. "eng_Latn") are
            first normalized through `split_lang_code_map`.
        split_algo: "default" to auto-select per language, "none" for no
            splitting, or one of "moses", "indic", "laonlp", "khmer",
            "bodnlp", "olchiki", "geez", "burmese".

    Returns:
        A callable mapping one line of text to an iterable of sentences,
        or None when `split_algo` is unknown (an error is logged).
    """
    if lang in split_lang_code_map:
        lang = split_lang_code_map[lang]

    # get default algorithm if requested
    if split_algo == "default":
        # use best algorithm in function of language
        if lang in LANGS_MOSES:
            split_algo = "moses"
        elif lang in LANGS_INDIC:
            split_algo = "indic"
        elif lang in LANGS_GEEZ:
            split_algo = "geez"
        # fix: the dedicated splitters below were imported and listed in
        # their language tables but never selectable via "default"
        elif lang in LANGS_OLCHIKI:
            split_algo = "olchiki"
        elif lang in LANGS_LAONLP:
            split_algo = "laonlp"
        elif lang in LANGS_BODNLP:
            split_algo = "bodnlp"
        elif lang in LANGS_KHMER:
            split_algo = "khmer"
        elif lang in LANGS_BURMESE:
            split_algo = "burmese"
        else:
            # use Moses by default (which likely will fall-back to English)
            split_algo = "moses"
        logger.info(f" - default algorithm for {lang} is {split_algo}")

    # NOTE(review): 'lang == "TODO"' looks like a leftover placeholder — confirm
    if split_algo == "none" or lang == "TODO":
        logger.info(" - no sentence splitting")
        return lambda line: [line]

    elif split_algo == "moses":
        if lang in LANGS_MOSES:
            lang = LANGS_MOSES[lang]
            logger.info(f" - Moses sentence splitter: using rules for '{lang}'")
        else:
            # fix: log the requested language *before* overwriting it; the
            # message used to read "for en: falling back to en rules"
            logger.info(
                f" - Moses sentence splitter for {lang}: falling back to English rules"
            )
            lang = "en"
        splitter = SentenceSplitter(language=lang)
        # non_breaking_prefix_file=non_breaking_prefix_file
        return splitter.split

    elif split_algo == "indic":
        # initialize toolkit (apparently not needed for sentence segmentation)
        if INDIC_NLP_RESOURCES:
            logger.info(" - Initialize Indic NLP toolkit")
            indic_common.set_resources_path(INDIC_NLP_RESOURCES)
            indic_loader.load()
        if lang in LANGS_INDIC:
            lang = LANGS_INDIC[lang]
            logger.info(f" - Indic sentence splitter: using rules for '{lang}'")
        else:
            # fix: same log-after-overwrite issue as in the Moses branch
            logger.info(
                f" - Indic sentence splitter for {lang}: falling back to Hindi rules"
            )
            lang = "hi"

        # setup normalizer
        factory = IndicNormalizerFactory()
        indic_normalizer = factory.get_normalizer(lang)

        def split_indic(line: str) -> tp.Iterable[str]:
            """Split Indian text into sentences using the Indic NLP tool."""
            line = indic_normalizer.normalize(line)
            for sent in indic_sent_tok.sentence_split(line, lang=lang):
                yield sent

        return split_indic

    elif split_algo == "laonlp":
        logger.info(f" - LaoNLP sentence splitter applied to '{lang}'")
        return lao_sent_tok

    elif split_algo == "khmer":
        logger.info(f" - Khmer NLTK sentence splitter applied to '{lang}'")
        return khm_sent_tok

    elif split_algo == "bodnlp":
        logger.info(f" - Tibetan NLTK sentence splitter applied to '{lang}'")
        return bod_sent_tok

    elif split_algo == "olchiki":
        # fix: split_olchiki was defined above but unreachable — no branch
        # of this dispatcher ever returned it
        logger.info(f" - Ol Chiki rule-based sentence splitter applied to '{lang}'")
        return split_olchiki

    elif split_algo == "geez":
        logger.info(f" - Ge'ez rule-based sentence splitter applied to '{lang}'")
        return split_geez

    elif split_algo == "burmese":
        logger.info(f" - Burmese rule-based sentence splitter applied to '{lang}'")
        return split_burmese

    else:
        logger.error(f"Unknown splitting algorithm {split_algo}")
        return None