from enum import Enum
from typing import List

from fastapi import APIRouter
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from pythainlp.tokenize import (
    sent_tokenize as py_sent_tokenize,
    subword_tokenize as py_subword_tokenize,
    word_tokenize as py_word_tokenize,
)

router = APIRouter()
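# This router is intended to be mounted from the main application,
# e.g. with `app.include_router(router)` on a `fastapi.FastAPI` instance.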


class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"

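
# Response models. A mutable default like `[]` is safe on a Pydantic
# model: field defaults are copied per instance, not shared.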
class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []

@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for Thai text

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Word tokenization engine (default: newmm)
    """
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
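# Illustrative request (exact tokens depend on the engine and text):
#   POST /word_tokenize?text=สวัสดีครับ&engine=newmm
#   -> {"words": ["สวัสดี", "ครับ"]}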


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for Thai text

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Subword tokenization engine (default: tcc)
    """
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
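# Illustrative: the default `tcc` engine yields Thai character clusters,
# e.g. "ประเทศไทย" -> ["ป", "ระ", "เท", "ศ", "ไท", "ย"].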


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Thai sentence segmentation

    ## Input

    - **text**: Text to segment.
    - **engine**: Sentence tokenization engine (default: crfcut)
    """
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
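

# Minimal smoke test, as a sketch: running this module directly mounts
# the router on a throwaway app. Assumes FastAPI's test client is
# available (recent versions require the `httpx` package).
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)

    # `text` and `engine` are simple (non-body) parameters, so FastAPI
    # reads them from the query string even on POST.
    resp = client.post(
        "/word_tokenize", params={"text": "สวัสดีครับ", "engine": "newmm"}
    )
    print(resp.json())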