# -*- coding: utf-8 -*-
import json
from fastapi import APIRouter, Response
from pythainlp.tokenize import (
    word_tokenize as py_word_tokenize,
    subword_tokenize as py_subword_tokenize,
    sent_tokenize as py_sent_tokenize,
)
from enum import Enum
from typing import List
from pydantic import BaseModel
router = APIRouter()


class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"


class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


class SentTokenizeResponse(BaseModel):
    sents: List[str] = []

@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenization (word segmentation) for Thai text

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Word tokenization engine (default: newmm)
    """
    return Response(
        json.dumps({"words": py_word_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenization (subword segmentation) for Thai text

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Subword tokenization engine (default: tcc)
    """
    return Response(
        json.dumps({"subwords": py_subword_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )

@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """
    Thai sentence segmentation

    ## Input
    - **text**: Text to tokenize.
    - **engine**: Sentence tokenization engine (default: crfcut)
    """
    return Response(
        json.dumps({"sents": py_sent_tokenize(text=text, engine=engine)}, ensure_ascii=False),
        media_type="application/json; charset=utf-8",
    )
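
# A minimal usage sketch (assumption: this module is run directly for a quick local
# check; the block below is not part of the router itself). It mounts the router on a
# throwaway FastAPI app and exercises /word_tokenize through FastAPI's TestClient.
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)
    # Plain (non-body) parameters are sent as query parameters.
    resp = client.post("/word_tokenize", params={"text": "สวัสดีครับ", "engine": "newmm"})
    print(resp.json())  # e.g. {"words": ["สวัสดี", "ครับ"]} (illustrative output)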