import streamlit as st
import streamlit.components.v1 as components
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd

# User interface configuration
st.set_page_config(
    page_title="Traduction d'une phrase en pictogrammes ARASAAC",
    page_icon="📝",
    layout="wide"
)

# Load the model and tokenizer
checkpoint = "Propicto/t2p-t5-large-orfeo"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
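# Note: Streamlit re-executes this whole script on every interaction, so the
# checkpoint above is reloaded on each rerun. A minimal sketch of a faster
# variant, assuming a Streamlit version that provides st.cache_resource
# (the load_model helper below is illustrative, not part of the original app):
#
# @st.cache_resource
# def load_model(name):
#     # Cached across reruns: tokenizer and model stay in memory for the session.
#     return AutoTokenizer.from_pretrained(name), AutoModelForSeq2SeqLM.from_pretrained(name)
#
# tokenizer, model = load_model(checkpoint)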
# Read the lexicon mapping lemmas to ARASAAC pictogram IDs
def read_lexicon(lexicon):
    df = pd.read_csv(lexicon, sep='\t')
    # Drop the category suffix (" #...") and join multi-word lemmas with "_"
    df['keyword_no_cat'] = df['lemma'].str.split(' #').str[0].str.strip().str.replace(' ', '_')
    return df

lexicon = read_lexicon("lexicon.csv")
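# For reference, read_lexicon expects a tab-separated file with at least the
# two columns used in this app, "lemma" and "id_picto". An illustrative row
# (the ID shown is a made-up placeholder, not a real ARASAAC entry):
#
#   lemma       id_picto
#   manger #v   12345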
# Post-process the translation output: the model emits space-separated lemmas
def process_output_trad(pred):
    return pred.split()

def get_id_picto_from_predicted_lemma(df_lexicon, lemma):
    # Look up the pictogram ID for a lemma; fall back to (0, lemma) when unknown
    id_picto = df_lexicon.loc[df_lexicon['keyword_no_cat'] == lemma, 'id_picto'].tolist()
    return (id_picto[0], lemma) if id_picto else (0, lemma)
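# Example (using the placeholder row above): get_id_picto_from_predicted_lemma(lexicon, "manger")
# would return (12345, "manger"), while an out-of-lexicon lemma such as "xyz"
# returns the sentinel (0, "xyz"), which generate_html below silently skips.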
# Generate the HTML content that displays the pictograms
def generate_html(ids):
    html_content = '<html><head><style>'
    html_content += '''
        figure {
            display: inline-block;
            text-align: center;
            font-family: Arial, sans-serif;
        }
        figcaption {
            color: black;
            background-color: white;
            border-radius: 1px;
        }
        img {
            background-color: white;
            margin: 1px;
            padding: 0;
        }
    '''
    html_content += '</style></head><body>'
    for picto_id, lemma in ids:
        if picto_id != 0:  # skip lemmas that were not found in the lexicon
            img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
            html_content += f'''
            <figure>
                <img src="{img_url}" alt="{lemma}" width="200" height="200"/>
                <figcaption>{lemma}</figcaption>
            </figure>
            '''
    html_content += '</body></html>'
    return html_content
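# Example: generate_html([(12345, "manger")]) produces a page with a single
# <figure> whose image points at
# https://static.arasaac.org/pictograms/12345/12345_500.png (again the
# placeholder ID), captioned "manger".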
st.title("Traduction d'une phrase en pictogrammes ARASAAC")
sentence = st.text_input("Entrez une phrase en français:")

if sentence:
    # Translate the French sentence into a sequence of lemmas
    inputs = tokenizer(sentence, return_tensors="pt").input_ids
    # Sampling (do_sample=True) makes the output non-deterministic across runs
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Map each predicted lemma to a pictogram ID, then render the result as HTML
    sentence_to_map = process_output_trad(pred)
    pictogram_ids = [get_id_picto_from_predicted_lemma(lexicon, lemma) for lemma in sentence_to_map]
    html = generate_html(pictogram_ids)
    components.html(html, height=800, scrolling=True)
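# To try the app locally (assuming this file is saved as app.py and lexicon.csv
# sits next to it), use the standard Streamlit launcher:
#
#   streamlit run app.py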