# -*- coding: utf-8 -*-
import json
from fastapi import APIRouter, Response
from pythainlp.tokenize import (
    word_tokenize as py_word_tokenize,
    subword_tokenize as py_subword_tokenize,
    sent_tokenize as py_sent_tokenize
)
from enum import Enum
from typing import List
from pydantic import BaseModel

router = APIRouter()
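
# Note: this module only defines the router. A minimal sketch of how it
# is presumably mounted by the main application (an assumption, not part
# of this module):
#
#     from fastapi import FastAPI
#     app = FastAPI()
#     app.include_router(router)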


class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"


class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []


@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for the Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Word tokenization engine (default: newmm)
    """
    return Response(
        json.dumps({"words": py_word_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )
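
# Example call (a sketch; any mount prefix is an assumption). FastAPI
# treats the plain str/enum parameters as query parameters, even on a
# POST route:
#
#   POST /word_tokenize?text=สวัสดีครับ&engine=newmm
#   -> e.g. {"words": ["สวัสดี", "ครับ"]}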


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for the Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Subword tokenization engine (default: tcc)
    """
    return Response(
        json.dumps({"subwords": py_subword_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Thai sentence segmentation

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Sentence tokenization engine (default: crfcut)
    """
    return Response(
        json.dumps({"sents": py_sent_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )
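

# --- Usage sketch (not part of the original module) ---
# A minimal local smoke test, assuming FastAPI's TestClient is available
# (it requires the `httpx` package).
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)

    # Parameters are sent as query parameters, matching the handler signatures.
    resp = client.post("/word_tokenize", params={"text": "ทดสอบการตัดคำ"})
    print(resp.json())  # expected shape: {"words": [...]} with Thai tokens

    resp = client.post("/sent_tokenize", params={"text": "สวัสดีครับ วันนี้อากาศดี"})
    print(resp.json())  # expected shape: {"sents": [...]}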