eaglelandsonce committed on
Commit
86ad0fb
·
verified ·
1 Parent(s): 4fca34d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -0
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import BertTokenizer, BertModel
4
+
5
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
_CHECKPOINT = "bert-base-uncased"

# Both objects are created once at import time and reused by every request.
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)
8
+
9
def preprocess(text: str):
    """Tokenize ``text`` with BERT and return its tokens and hidden states.

    The text is stripped of non-ASCII characters and lowercased before it is
    tokenized. The encoded sequence is truncated to the number of positions
    the model supports, so long inputs no longer fail in the forward pass.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        A ``(tokens, embeddings)`` tuple. ``tokens`` is the list of token
        strings produced by the tokenizer for the encoded input, and
        ``embeddings`` is a nested list holding one vector of floats per
        token, taken from the model's ``last_hidden_state``.
    """
    # Drop every character that cannot be encoded as ASCII, then lowercase.
    cleaned = (text or "").encode("ascii", "ignore").decode().lower()

    # Cap the sequence at the size of the model's position-embedding table;
    # without truncation, longer inputs raise an indexing error inside the
    # model instead of producing a result.
    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state is [1, seq_len, hidden_size]; index 0 drops the batch
    # dimension before converting to plain Python lists.
    embeddings = outputs.last_hidden_state[0].tolist()
    return tokens, embeddings
21
+
22
# Input widget: a three-line free-text box.
_text_input = gr.Textbox(lines=3, placeholder="Type something…")

# Output widgets, in the same order as the values returned by preprocess().
_result_tables = [
    gr.Dataframe(headers=["token"], label="Tokens"),
    gr.Dataframe(label="Embeddings"),
]

# Wire the preprocessing function to the widgets defined above.
iface = gr.Interface(
    title="BERT Tokenizer + Embeddings",
    description="Cleans input, lowercases it, then shows BERT tokens & their hidden‑state vectors.",
    fn=preprocess,
    inputs=_text_input,
    outputs=_result_tables,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
35
+