Spaces:
Sleeping
Sleeping
File size: 1,689 Bytes
1a7c2a7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import gradio as gr
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
# 1. Pretrained uncased BERT: the tokenizer and the model are loaded from the
#    same checkpoint name so that token ids match the model's vocabulary.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)
def clean_and_embed(text: str):
    """Clean *text*, tokenize it with BERT, and return per-token embeddings.

    The text is stripped of every non-ASCII character and lowercased before
    being passed to the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw input text. ``None`` is treated like an empty string.

    Returns:
        A 2-tuple ``(tokens, df)``:

        * ``tokens`` -- the tokens produced by the tokenizer, joined with
          single spaces into one string.
        * ``df`` -- a ``pandas.DataFrame`` with one row per token (the index
          holds the token strings) and one ``dim_<i>`` column per dimension
          of the model's last hidden state.
    """
    # 2. Clean: remove non-ASCII, lowercase
    clean = (text or "").encode('ascii', 'ignore').decode().lower()

    # 3. Tokenize + encode for PyTorch.
    # Truncate to the model's position-embedding capacity: without this,
    # inputs longer than the model supports make the forward pass fail.
    inputs = tokenizer(
        clean,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    token_ids = inputs['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # 4. Get embeddings (last hidden state); no gradients are needed here
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]  # shape: (seq_len, hidden_size)
    emb_np = embeddings.cpu().numpy()

    # 5. Build a pandas DataFrame for display
    #    Rows -> tokens, Columns -> embedding dimensions
    df = pd.DataFrame(
        emb_np,
        index=tokens,
        columns=[f"dim_{i}" for i in range(emb_np.shape[1])],
    )

    # Return tokens list (as a single string) and DataFrame
    return " ".join(tokens), df
# 6. Gradio interface: one text input, two outputs (token string + embedding table)
_text_input = gr.Textbox(lines=2, placeholder="Type your text here...")
_token_output = gr.Textbox(label="BERT Tokens")
_embedding_output = gr.Dataframe(label="Token Embeddings (one row per token)")

iface = gr.Interface(
    fn=clean_and_embed,
    inputs=_text_input,
    outputs=[_token_output, _embedding_output],
    title="ASCII‑Cleaned → BERT Tokenizer & Embeddings",
    description="Enter text to strip non‑ASCII chars, lowercase it, then view BERT tokens and their embeddings.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|