import gradio as gr
import torch
from transformers import BertTokenizer, BertModel

# Load BERT (uncased) tokenizer & model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


def preprocess(text: str):
    # remove non-ASCII characters and lowercase
    cleaned = text.encode("ascii", "ignore").decode().lower()

    # tokenize
    inputs = tokenizer(cleaned, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state: [1, seq_len, hidden_size] -> drop batch dim
    embeddings = outputs.last_hidden_state[0].tolist()

    # one token per row so the output matches the single "token" column
    return [[t] for t in tokens], embeddings


iface = gr.Interface(
    fn=preprocess,
    inputs=gr.Textbox(lines=3, placeholder="Type something…"),
    outputs=[
        gr.Dataframe(headers=["token"], label="Tokens"),
        gr.Dataframe(label="Embeddings"),
    ],
    title="BERT Tokenizer + Embeddings",
    description="Cleans input, lowercases it, then shows BERT tokens & their hidden-state vectors.",
)

if __name__ == "__main__":
    iface.launch()