Update app.py
app.py
CHANGED
@@ -9,28 +9,21 @@ from sklearn.decomposition import PCA
 from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM
 
 # App Title
-st.title("
+st.title("🚀 Large Language Model Explorer")
 st.markdown("""
-
+Explore Large Language Models: their architectures, tokenization, and attention mechanisms.
 """)
 
-#
+# Model Selection
 model_name = st.selectbox(
-    "
-    ["
+    "Select Large Language Model:",
+    ["gpt-j-6b", "opt-175b", "bigscience/bloom-176b"]
 )
 
 # Load Tokenizer & Model
 st.write(f"Loading model: `{model_name}`...")
-
-
-    model = AutoModel.from_pretrained(model_name)
-elif "whisper" in model_name:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
-elif "wav2vec2" in model_name:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModel.from_pretrained(model_name)
 
 # Display Model Details
 st.subheader("📜 Model Details")
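The hunk above collapses the old per-model `if`/`elif` loading chain (which still had `whisper` and `wav2vec2` branches) into one `AutoTokenizer`/`AutoModel` pair. Worth flagging: the selectbox entries are short names rather than canonical Hub repo IDs, and checkpoints in the 6B to 176B range rarely load with a bare `from_pretrained` call. A minimal sketch of a more memory-conscious loading step, assuming the `EleutherAI/gpt-j-6B` repo ID and the optional `accelerate` package (neither is part of this commit):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Assumed canonical Hub ID; the commit's selectbox uses the short name "gpt-j-6b".
model_name = "EleutherAI/gpt-j-6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half-precision weights: ~12 GB instead of ~24 GB for 6B params
    device_map="auto",          # requires `accelerate`; shards layers across available devices
)
```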
@@ -42,75 +35,57 @@ st.write(f"Total Parameters: `{sum(p.numel() for p in model.parameters())/1e6:.2
 # Model Size Comparison
 st.subheader("📊 Model Size Comparison")
 model_sizes = {
-    "
-    "
-    "
+    "gpt-j-6b": 6,
+    "opt-175b": 175,
+    "bigscience/bloom-176b": 176
 }
-df_size = pd.DataFrame(model_sizes.items(), columns=["Model", "Size (
-fig = px.bar(df_size, x="Model", y="Size (
+df_size = pd.DataFrame(model_sizes.items(), columns=["Model", "Size (Billion Parameters)"])
+fig = px.bar(df_size, x="Model", y="Size (Billion Parameters)", title="Model Size Comparison")
 st.plotly_chart(fig)
 
 # Tokenization Section
 st.subheader("🔍 Tokenization Visualization")
 input_text = st.text_input("Enter Text:", "Hello, how are you?")
-
-
-    st.write("Note: Whisper is an audio model and doesn't use text tokenization")
-    st.write("Instead, it processes raw audio waveforms")
-else:
-    tokens = tokenizer.tokenize(input_text)
-    st.write("Tokenized Output:", tokens)
+tokens = tokenizer.tokenize(input_text)
+st.write("Tokenized Output:", tokens)
 
 # Token Embeddings Visualization (Fixed PCA Projection)
 st.subheader("🧩 Token Embeddings Visualization")
 with torch.no_grad():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    df_embeddings["Token"] = tokens
-    fig = px.scatter(df_embeddings, x="PCA1", y="PCA2", text="Token",
-                     title="Token Embeddings (PCA Projection)")
-    st.plotly_chart(fig)
+    inputs = tokenizer(input_text, return_tensors="pt")
+    outputs = model(**inputs)
+    if hasattr(outputs, "last_hidden_state"):
+        embeddings = outputs.last_hidden_state.squeeze(0).numpy()
+        # Ensure the number of tokens and embeddings match
+        n_tokens = min(len(tokens), embeddings.shape[0])
+        embeddings = embeddings[:n_tokens]  # Trim embeddings to match token count
+        tokens = tokens[:n_tokens]  # Trim tokens to match embeddings count
+        pca = PCA(n_components=2)
+        reduced_embeddings = pca.fit_transform(embeddings)
+        df_embeddings = pd.DataFrame(reduced_embeddings, columns=["PCA1", "PCA2"])
+        df_embeddings["Token"] = tokens
+        fig = px.scatter(df_embeddings, x="PCA1", y="PCA2", text="Token",
+                         title="Token Embeddings (PCA Projection)")
+        st.plotly_chart(fig)
 
-#
-
-
-
-    outputs = model(**inputs, output_attentions=True)
-    attention = outputs.attentions[-1].squeeze().detach().numpy()
-    fig, ax = plt.subplots(figsize=(10, 5))
-    sns.heatmap(attention[0], cmap="viridis", xticklabels=tokens, yticklabels=tokens, ax=ax)
-    st.pyplot(fig)
+# Text Generation Demo
+st.subheader("✍️ Text Generation & Token Probabilities")
+model_gen = AutoModelForCausalLM.from_pretrained(model_name)
+generator = pipeline("text-generation", model=model_name, return_full_text=False)
 
-#
-
-
-    generator = pipeline("text-generation", model=model_name, return_full_text=False)
-    generated_output = generator(input_text, max_length=50, return_tensors=True)
-    st.write("Generated Output:", generated_output[0]["generated_text"])
+# Generate text
+generated_output = generator(input_text, max_length=50)
+st.write("Generated Output:", generated_output[0]["generated_text"])
 
-
-
-
-
-
-
-
-
-
-
-    st.plotly_chart(fig_prob)
+# Token Probability Visualization
+with torch.no_grad():
+    inputs = tokenizer(input_text, return_tensors="pt")
+    logits = model_gen(**inputs).logits[:, -1, :]
+    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().detach().numpy()
+    top_tokens = np.argsort(probs)[-10:][::-1]  # Top 10 tokens
+    token_probs = {tokenizer.decode([idx]): probs[idx] for idx in top_tokens}
+    df_probs = pd.DataFrame(token_probs.items(), columns=["Token", "Probability"])
+    fig_prob = px.bar(df_probs, x="Token", y="Probability", title="Top Token Predictions")
+    st.plotly_chart(fig_prob)
 
-st.markdown("💡 *Explore more about
+st.markdown("💡 *Explore more about Large Language Models!*\n")
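The token-embeddings hunk is the "Fixed PCA Projection" its comment advertises: it trims `tokens` and the hidden states to a common length before projecting, so `px.scatter` never receives mismatched columns. The same projection as a self-contained script, assuming the small `gpt2` checkpoint as a stand-in so it runs without a GPU:

```python
import pandas as pd
import torch
from sklearn.decomposition import PCA
from transformers import AutoModel, AutoTokenizer

# "gpt2" is an assumed stand-in for the multi-billion-parameter models above.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModel.from_pretrained("gpt2")

text = "Hello, how are you?"
tokens = tokenizer.tokenize(text)
with torch.no_grad():
    hidden = model(**tokenizer(text, return_tensors="pt")).last_hidden_state

# One hidden-state row per token; PCA maps each vector down to 2-D for plotting.
embeddings = hidden.squeeze(0).numpy()
n = min(len(tokens), embeddings.shape[0])  # guard against length mismatch
df = pd.DataFrame(PCA(n_components=2).fit_transform(embeddings[:n]),
                  columns=["PCA1", "PCA2"])
df["Token"] = tokens[:n]
print(df)
```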
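The commit also drops the old seaborn attention heatmap and rewrites the token-probability section: softmax over the logits at the last position gives a distribution over the vocabulary, and the ten largest entries become the bar chart. The same computation as a standalone script, again assuming `gpt2` as a stand-in checkpoint:

```python
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")        # assumed stand-in model
model_gen = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    # The logits at the final position score every vocabulary entry as the next token.
    logits = model_gen(**inputs).logits[:, -1, :]
probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().numpy()

for idx in np.argsort(probs)[-10:][::-1]:  # ten most probable next tokens
    print(f"{tokenizer.decode([idx])!r}  {probs[idx]:.4f}")
```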
|