import gradio as gr
import sentencepiece as spm

examples = [
    "Hello, world!",
    "European Central bank has announced cuts.",
    "This document is a summary of the European Public Assessment Report (EPAR).",
    "En el presente documento se resume el Informe Público Europeo de Evaluación (EPAR).",
    "Solution for injection",
    "How is Abilify used?",
    "¿Para qué se utiliza Abilify?",
    "Tratado de la Unión Europea y Tratado de Funcionamiento de la Unión Europea"]


# Load both tokenizer models once at startup instead of on every request.
sp_ecb = spm.SentencePieceProcessor()
sp_ecb.load('bpe-ECB.model')

sp_emea = spm.SentencePieceProcessor()
sp_emea.load('bpe-EMEA.model')


def tokenize(sentence):
    # encode_as_pieces returns the list of subword strings for the sentence,
    # with '▁' marking word-initial pieces; join them with a visible separator.
    separator = "<span style='background-color: yellow;'> • </span>"
    return ("<div class='output'>" +
            "<div><b>ECB dataset</b><br>" +
            separator.join(sp_ecb.encode_as_pieces(sentence)) +
            "</div>" +
            "<div style='padding-top: 1em;'><b>EMEA dataset</b><br>" +
            separator.join(sp_emea.encode_as_pieces(sentence)) +
            "</div>" +
            "</div>")


description = """
Demo for SentencePiece. Two BPE models are trained, one on the ECB dataset and one on the EMEA dataset, to show how the training domain affects tokenization.
The ECB dataset contains financial news articles, while the EMEA dataset contains medical documents.
The training texts are in English and Spanish, so tokenization works best for these two languages.
You can try other languages and see how the tokenization behaves, but make sure you use only Latin characters:
the models saw no non-Latin characters during training, so results for other scripts will be unpredictable.
Both models are trained with a vocabulary size of 5,000.
"""

demo = gr.Interface(fn=tokenize, inputs="text", outputs="html",
                    examples=examples, title="SentencePiece",
                    description=description,
                    cache_examples="lazy",
                    concurrency_limit=30,
                    css=".output {font-size: 150%;}")
demo.launch(share=True)