# Captured from a Hugging Face Space (status at capture time: Running).
import functools
import html

import gradio as gr
import sentencepiece as spm
# Sample inputs offered in the UI: short English and Spanish sentences
# (general, financial and medical wording) to try against both tokenizers.
examples = [
    "Hello, world!",
    "European Central bank has announced cuts.",
    "This document is a summary of the European Public Assessment Report (EPAR).",
    "En el presente documento se resume el Informe Público Europeo de Evaluación (EPAR).",
    "Solution for injection",
    "How is Abilify used?",
    "¿Para qué se utiliza Abilify?",
    "Tratado de la Unión Europea y Tratado de Funcionamiento de la Unión Europea",
]
# Highlighted bullet rendered between consecutive token pieces.
_PIECE_SEPARATOR = "<span style='background-color: yellow;'> • </span>"


@functools.lru_cache(maxsize=None)
def _load_processor(model_path):
    """Load the SentencePiece model at *model_path* once and reuse it.

    The result is cached per path so the model file is not re-read on every
    request.
    """
    # NOTE(review): the cached processor is shared by concurrent requests;
    # encoding is presumed safe to call from several threads - confirm
    # against the sentencepiece documentation.
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor


def _render_pieces(processor, sentence):
    """Return the pieces of *sentence* as HTML, joined by the separator."""
    # The pieces are derived from user input, so escape them before they are
    # embedded in markup.
    return _PIECE_SEPARATOR.join(
        html.escape(piece) for piece in processor.encode_as_pieces(sentence)
    )


def greet(sentence):
    """Tokenize *sentence* with both models and return an HTML comparison.

    Args:
        sentence: Text to split into SentencePiece pieces.

    Returns:
        An HTML fragment (wrapped in ``<div class='output'>``) with one
        section for the ``bpe-ECB.model`` tokenization and one for the
        ``bpe-EMEA.model`` tokenization; pieces are separated by a
        highlighted bullet.
    """
    ecb_html = _render_pieces(_load_processor('bpe-ECB.model'), sentence)
    emea_html = _render_pieces(_load_processor('bpe-EMEA.model'), sentence)
    # "<br>" replaces the original "</br>", which is not valid HTML.
    return (
        "<div class='output'>"
        f"<div><b>ECB dataset</b><br>{ecb_html}</div>"
        f"<div style='padding-top: 1em;'><b>EMEA dataset</b><br>{emea_html}</div>"
        "</div>"
    )
# Explanatory text passed to the interface as its description.
description = """
Demo for SentencePiece. The model is trained on ECB and EMEA datasets in order to see the differences in tokenization.
The ECB dataset contains financial news articles, while the EMEA dataset contains medical articles.
The texts included in the training are in English and Spanish, for this reason the tokenisation will work best for these languages.
You can try some other languages and see how the tokenisation works. However, make sure you use only Latin characters.
The model did not see any non-Latin characters during training, so the results for languages that do not use Latin characters will be unpredictable.
Both variants are trained with 5000 vocab size.
"""
# Build the web UI: one text input, HTML output produced by greet().
# The custom CSS targets the "output" class of the wrapper <div> that greet()
# returns, enlarging the rendered tokens.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="html",
    examples=examples,
    title="SentencePiece",
    description=description,
    cache_examples="lazy",
    concurrency_limit=30,
    css=".output {font-size: 150%;}",
)
# NOTE(review): share=True requests a public share link; this is presumably
# ignored (with a warning) when the app is hosted on Hugging Face Spaces -
# confirm whether it is still needed.
demo.launch(share=True)