import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from fpdf import FPDF

# User interface
st.set_page_config(
    page_title="Traduction d'une phrase en pictogrammes ARASAAC",
    page_icon="📝",
    layout="wide"
)

# Load the model and the tokenizer
# checkpoint = "Propicto/t2p-t5-large-orfeo"
checkpoint = "Propicto/t2p-nllb-200-distilled-600M-all"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Read the lexicon
@st.cache_data
def read_lexicon(lexicon):
    df = pd.read_csv(lexicon, sep='\t')
    # Normalize lemmas: drop the " #category" suffix, trim, and replace spaces with underscores
    df['keyword_no_cat'] = df['lemma'].str.split(' #').str[0].str.strip().str.replace(' ', '_')
    return df

lexicon = read_lexicon("lexicon.csv")

# Post-process the translation output
def process_output_trad(pred):
    return pred.split()

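# Look up the ARASAAC pictogram id for a predicted lemma; returns (0, lemma) when no match is found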
def get_id_picto_from_predicted_lemma(df_lexicon, lemma):
    if lemma.endswith("!"):
        lemma = lemma[:-1]
    id_picto = df_lexicon.loc[df_lexicon['keyword_no_cat'] == lemma, 'id_picto'].tolist()
    return (id_picto[0], lemma) if id_picto else (0, lemma)

# Generate the HTML content to display the pictograms
def generate_html(ids):
    html_content = '<html><head><style>'
    html_content += '''
        figure {
            display: inline-block;
            text-align: center;
            font-family: Arial, sans-serif;
            margin: 0;
        }
        figcaption {
            color: black;
            background-color: white;
            border-radius: 5px;
        }
        img {
            background-color: white;
            margin: 0;
            padding: 0;
            border-radius: 6px;
        }
    '''
    html_content += '</style></head><body>'
    for picto_id, lemma in ids:
        if picto_id != 0:  # ignore invalid IDs
            img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
            html_content += f'''
            <figure>
                <img src="{img_url}" alt="{lemma}" width="200" height="200"/>
                <figcaption>{lemma}</figcaption>
            </figure>
            '''
    html_content += '</body></html>'
    return html_content


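# Generate a PDF containing the pictogram images and their captions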
def generate_pdf(ids):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    for picto_id, lemma in ids:
        if picto_id != 0:  # ignore invalid IDs
            img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
            pdf.image(img_url, x=None, y=None, w=50, h=50)
            pdf.ln(55)
            pdf.set_font("Arial", size=12)
            pdf.cell(200, 10, txt=lemma, ln=True, align='C')
    
    pdf_path = "pictograms.pdf"
    pdf.output(pdf_path)
    return pdf_path



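# Streamlit interface: read a French sentence, display its pictogram translation, and offer a PDF export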
st.title("Traduction d'une phrase en pictogrammes ARASAAC")
sentence = st.text_input("Entrez une phrase en français:")
if sentence:
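    # Tokenize the sentence and generate the pictogram-lemma sequence (sampling decoding)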
    inputs = tokenizer(sentence, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

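    # Map each predicted lemma to its ARASAAC pictogram id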
    sentence_to_map = process_output_trad(pred)
    pictogram_ids = [get_id_picto_from_predicted_lemma(lexicon, lemma) for lemma in sentence_to_map]

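    # Render the pictograms inline as HTML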
    html = generate_html(pictogram_ids)
    st.components.v1.html(html, height=800, scrolling=True)

    # Container to hold the download button
    download_container = st.container()
    with download_container:
        pdf_path = generate_pdf(pictogram_ids)
        with open(pdf_path, "rb") as pdf_file:
            st.download_button(label="Télécharger la traduction en PDF", data=pdf_file, file_name="pictograms.pdf", mime="application/pdf")
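
# To run the app locally (assuming this file is saved as app.py and lexicon.csv is present):
#   streamlit run app.py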