|
import gradio as gr |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
# Load the sentence-embedding model once at import time so every request
# reuses the same weights instead of reloading them per call.
model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
def clean_and_embed(text: str):
    """Strip non-ASCII characters, lowercase, tokenize, and embed *text*.

    Returns a tuple of:
      * a space-joined string of the model tokenizer's tokens, and
      * a one-row pandas DataFrame holding the sentence embedding,
        indexed "sentence_embedding" with columns dim_0 .. dim_{D-1}.
    """
    # Drop every non-ASCII character, then normalize case.
    normalized = text.encode('ascii', 'ignore').decode().lower()

    # Tokenize with the model's own tokenizer (shown to the user as-is).
    token_list = model.tokenizer.tokenize(normalized)

    # Encode the whole cleaned string into a single embedding vector.
    vector = model.encode(normalized, convert_to_numpy=True)

    # Present the vector as one labelled row, one column per dimension.
    embedding_frame = pd.DataFrame(
        [vector],
        index=["sentence_embedding"],
        columns=[f"dim_{i}" for i in range(vector.shape[0])],
    )

    return " ".join(token_list), embedding_frame
|
|
|
|
|
# Wire the function into a simple Gradio UI: one multiline text input and
# two outputs (the token string and the embedding DataFrame), matching the
# (tokens, dataframe) tuple returned by clean_and_embed.
iface = gr.Interface(
    fn=clean_and_embed,
    inputs=gr.Textbox(lines=2, placeholder="Type your text here…"),
    outputs=[
        gr.Textbox(label="Tokens"),
        gr.Dataframe(label="Sentence Embedding Vector")
    ],
    title="ASCII‑Clean + SentenceTransformer",
    description="Cleans input, tokenizes with a SentenceTransformer tokenizer, and shows the sentence embedding."
)
|
|
|
# Start the Gradio web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|