|
from config import LANGUAGE_IDENTIFICATION_LIBRARY |
|
|
|
# Name of the configured language-identification backend, lower-cased so the
# string comparisons in classify_language() are case-insensitive.
module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
|
|
|
|
|
def classify_language(text: str, target_languages: list = None) -> str:
    """Identify the language of *text* with the configured backend.

    The backend is chosen by the module-level ``module`` value:
    ``"fastlid"`` / ``"fasttext"`` use ``fastlid.fastlid`` and ``"langid"``
    uses ``langid.classify``. The backend is imported inside the matching
    branch, so only the selected library is imported.

    Args:
        text: The text to classify.
        target_languages: Optional list of candidate language codes handed
            to the backend. When ``None`` the backend's language setting is
            left untouched.

    Returns:
        The first element of the backend's result for ``text``.

    Raises:
        ValueError: If the configured library name is not one of
            ``"fastlid"``, ``"fasttext"`` or ``"langid"``.
    """
    # NOTE(review): the language restriction is written onto the backend
    # itself, so it presumably stays in effect for later calls that pass
    # target_languages=None -- confirm whether that is intended.
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid

        classifier = fastlid
        if target_languages is not None:
            # fastlid is configured by assigning this attribute, not calling it.
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            langid.set_languages(target_languages)
    else:
        raise ValueError("Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")

    # Only item 0 of the result is used; presumably the result is a
    # (language, score) pair -- verify against the backend documentation.
    return classifier(text)[0]
|
|
|
|
|
def classify_zh_ja(text: str) -> str:
    """Tell Japanese from Chinese text by looking for kana.

    The text is reported as Japanese as soon as it contains one character in
    U+3040-U+30FF (the Hiragana and Katakana blocks); otherwise it is
    reported as Chinese. Han characters alone never decide the result, so
    kanji-only input, empty input and non-CJK input all yield ``"zh"``.

    Args:
        text: The text to inspect.

    Returns:
        ``"ja"`` if any hiragana or katakana character occurs in ``text``,
        else ``"zh"``.
    """
    # Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) are adjacent, so a
    # single range test covers both. A separate "Han character followed by
    # kana" lookahead is unnecessary: that kana is itself found by this scan.
    if any(0x3040 <= ord(char) <= 0x30FF for char in text):
        return "ja"
    return "zh"
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run both classifiers on one Chinese and one Japanese
    # sample and print the detected language codes.
    for text in ("这是一个测试文本", "これはテストテキストです"):
        print(classify_language(text))
        print(classify_zh_ja(text))
|
|