import gradio as gr
import sentencepiece as spm

examples = [
    "Hello, world!",
    "European Central bank has announced cuts.",
    "This document is a summary of the European Public Assessment Report (EPAR).",
    "En el presente documento se resume el Informe Público Europeo de Evaluación (EPAR).",
    "Solution for injection",
    "How is Abilify used?",
    "¿Para qué se utiliza Abilify?",
    "Tratado de la Unión Europea y Tratado de Funcionamiento de la Unión Europea"]


# Load both tokenizer models once at startup instead of on every request.
sp_ecb = spm.SentencePieceProcessor()
sp_ecb.load('bpe-ECB.model')

sp_emea = spm.SentencePieceProcessor()
sp_emea.load('bpe-EMEA.model')


def tokenize(sentence):
    # encode_as_pieces returns the list of subword strings for the sentence,
    # with '▁' marking word-initial pieces; join them with a visible separator.
    separator = "<span style='background-color: yellow;'> • </span>"
    return ("<div class='output'>" +
            "<div><b>ECB dataset</b><br>" +
            separator.join(sp_ecb.encode_as_pieces(sentence)) +
            "</div>" +
            "<div style='padding-top: 1em;'><b>EMEA dataset</b><br>" +
            separator.join(sp_emea.encode_as_pieces(sentence)) +
            "</div>" +
            "</div>")


description = """
Demo for SentencePiece. Two BPE models are trained, one on the ECB dataset and one on the EMEA dataset, to show how the training domain affects tokenization.
The ECB dataset contains financial news articles, while the EMEA dataset contains medical documents.
The training texts are in English and Spanish, so tokenization works best for these two languages.
You can try other languages and see how the tokenization behaves, but make sure you use only Latin characters:
the models saw no non-Latin characters during training, so results for other scripts will be unpredictable.
Both models are trained with a vocabulary size of 5,000.
"""

demo = gr.Interface(fn=tokenize, inputs="text", outputs="html",
                    examples=examples, title="SentencePiece",
                    description=description,
                    cache_examples="lazy",
                    concurrency_limit=30,
                    css=".output {font-size: 150%;}")
demo.launch(share=True)