borutokarma123 committed on
Commit
46d2acf
·
verified ·
1 Parent(s): 21a66c3

Update app.py

Files changed (1)
  1. app.py +47 -72
app.py CHANGED
@@ -9,28 +9,21 @@ from sklearn.decomposition import PCA
 from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM
 
 # App Title
-st.title(" Transformer Model Explorer")
+st.title("🚀 Large Language Model Explorer")
 st.markdown("""
-Transformer models, their architectures, tokenization, and attention mechanisms will be displayed.
+Large Language models, their architectures, tokenization, and attention mechanisms.
 """)
 
-# Select Transformer
+#Selection
 model_name = st.selectbox(
-    "Choose a Transformer :",
-    ["bigscience/bloom", "openai/whisper-base", "facebook/wav2vec2-base-960h"]
+    "Select Large Language Model:",
+    ["gpt-j-6b", "opt-175b", "bigscience/bloom-176b"]
 )
 
 # Load Tokenizer & Model
 st.write(f"Loading model: `{model_name}`...")
-if "bloom" in model_name:
-    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-    model = AutoModel.from_pretrained(model_name)
-elif "whisper" in model_name:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
-elif "wav2vec2" in model_name:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModel.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModel.from_pretrained(model_name)
 
 # Display Model Details
 st.subheader("🛠 Model Details")
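Note on the simplified loader: collapsing the per-model branches is reasonable now that all three choices are causal text models, but the new selectbox values do not look like resolvable Hub repo ids (published checkpoints carry an org prefix, e.g. `EleutherAI/gpt-j-6b` or `bigscience/bloom`, and OPT-175B is not openly hosted), and checkpoints this large will not load in a typical Space. A minimal sketch of a cached, error-tolerant loader; the small stand-in checkpoints are assumptions of this note, not part of the commit:

```python
import streamlit as st
from transformers import AutoModel, AutoTokenizer

@st.cache_resource  # load each checkpoint once per server process
def load_model(name: str):
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)
    return tokenizer, model

# Hypothetical small stand-ins; swap in the intended large models if resources allow.
model_name = st.selectbox(
    "Select Large Language Model:",
    ["distilgpt2", "facebook/opt-125m", "bigscience/bloom-560m"],
)
try:
    tokenizer, model = load_model(model_name)
except OSError as err:  # unresolvable repo id or missing files
    st.error(f"Could not load `{model_name}`: {err}")
    st.stop()
```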
@@ -42,75 +35,57 @@ st.write(f"Total Parameters: `{sum(p.numel() for p in model.parameters())/1e6:.2
 # Model Size Comparison
 st.subheader("📊 Model Size Comparison")
 model_sizes = {
-    "bigscience/bloom": 176,
-    "openai/whisper-base": 74,
-    "facebook/wav2vec2-base-960h": 317
+    "gpt-j-6b": 6,
+    "opt-175b": 175,
+    "bigscience/bloom-176b": 176
 }
-df_size = pd.DataFrame(model_sizes.items(), columns=["Model", "Size (Million Parameters)"])
-fig = px.bar(df_size, x="Model", y="Size (Million Parameters)", title="Model Size Comparison")
+df_size = pd.DataFrame(model_sizes.items(), columns=["Model", "Size (Billion Parameters)"])
+fig = px.bar(df_size, x="Model", y="Size (Billion Parameters)", title="Model Size Comparison")
 st.plotly_chart(fig)
 
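The chart hardcodes the published sizes in billions (GPT-J 6B, OPT 175B, BLOOM 176B), which is fine for models too large to load. For whatever checkpoint is actually loaded, the exact count can be computed instead, mirroring the "Total Parameters" line visible in the hunk header above; a small sketch reusing the app's `model`:

```python
# Exact size of the loaded checkpoint, in billions of parameters
n_params = sum(p.numel() for p in model.parameters())
st.write(f"Total Parameters: `{n_params / 1e9:.2f}B`")
```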
 # Tokenization Section
 st.subheader("📝 Tokenization Visualization")
 input_text = st.text_input("Enter Text:", "Hello, how are you?")
-
-if "whisper" in model_name:
-    st.write("Note: Whisper is an audio model and doesn't use text tokenization")
-    st.write("Instead, it processes raw audio waveforms")
-else:
-    tokens = tokenizer.tokenize(input_text)
-    st.write("Tokenized Output:", tokens)
+tokens = tokenizer.tokenize(input_text)
+st.write("Tokenized Output:", tokens)
 
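Dropping the Whisper branch makes the straight-line tokenize call safe. One subtlety relevant to the next section: `tokenizer.tokenize` returns subword strings without any special tokens, while `tokenizer(...)` may add them (depending on the tokenizer), which is why the embedding code below trims lengths. A sketch of the two views, assuming the app's `tokenizer` and `input_text`:

```python
tokens = tokenizer.tokenize(input_text)  # subword strings, no special tokens
encoded = tokenizer(input_text)          # ids as the model will see them
with_special = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
st.write("Tokenized Output:", tokens)
st.write("As seen by the model:", with_special)
```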
 # Token Embeddings Visualization (Fixed PCA Projection)
 st.subheader("🧩 Token Embeddings Visualization")
 with torch.no_grad():
-    if "whisper" in model_name:
-        st.write("Note: Whisper uses a different embedding structure for audio features")
-        st.write("Cannot directly visualize token embeddings as with text models")
-    else:
-        inputs = tokenizer(input_text, return_tensors="pt")
-        outputs = model(**inputs)
-        if hasattr(outputs, "last_hidden_state"):
-            embeddings = outputs.last_hidden_state.squeeze(0).numpy()
-            # Ensure the number of tokens and embeddings match
-            n_tokens = min(len(tokens), embeddings.shape[0])
-            embeddings = embeddings[:n_tokens]  # Trim embeddings to match token count
-            tokens = tokens[:n_tokens]  # Trim tokens to match embeddings count
-            pca = PCA(n_components=2)
-            reduced_embeddings = pca.fit_transform(embeddings)
-            df_embeddings = pd.DataFrame(reduced_embeddings, columns=["PCA1", "PCA2"])
-            df_embeddings["Token"] = tokens
-            fig = px.scatter(df_embeddings, x="PCA1", y="PCA2", text="Token",
-                             title="Token Embeddings (PCA Projection)")
-            st.plotly_chart(fig)
+    inputs = tokenizer(input_text, return_tensors="pt")
+    outputs = model(**inputs)
+    if hasattr(outputs, "last_hidden_state"):
+        embeddings = outputs.last_hidden_state.squeeze(0).numpy()
+        # Ensure the number of tokens and embeddings match
+        n_tokens = min(len(tokens), embeddings.shape[0])
+        embeddings = embeddings[:n_tokens]  # Trim embeddings to match token count
+        tokens = tokens[:n_tokens]  # Trim tokens to match embeddings count
+        pca = PCA(n_components=2)
+        reduced_embeddings = pca.fit_transform(embeddings)
+        df_embeddings = pd.DataFrame(reduced_embeddings, columns=["PCA1", "PCA2"])
+        df_embeddings["Token"] = tokens
+        fig = px.scatter(df_embeddings, x="PCA1", y="PCA2", text="Token",
+                         title="Token Embeddings (PCA Projection)")
+        st.plotly_chart(fig)
 
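The trim-to-match guard works, but the labels and the hidden-state rows can be aligned exactly by deriving the token strings from the encoded ids rather than from `tokenize`; a sketch under the same app context (`tokenizer`, `model`, `input_text`):

```python
import torch

inputs = tokenizer(input_text, return_tensors="pt")
aligned_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state.squeeze(0).numpy()
# One label per embedding row, including any special tokens
assert len(aligned_tokens) == hidden.shape[0]
```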
-# Attention Visualization (for BERT & RoBERTa models)
-if "bloom" in model_name:
-    st.subheader("🔍 Attention Map")
-    with torch.no_grad():
-        outputs = model(**inputs, output_attentions=True)
-        attention = outputs.attentions[-1].squeeze().detach().numpy()
-        fig, ax = plt.subplots(figsize=(10, 5))
-        sns.heatmap(attention[0], cmap="viridis", xticklabels=tokens, yticklabels=tokens, ax=ax)
-        st.pyplot(fig)
 
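The attention-map panel is removed outright in this commit (its comment also mislabeled it as a BERT/RoBERTa feature while gating on BLOOM). If it returns later, the heatmap only needs `output_attentions=True` at call time; a sketch assuming the app's `model`, the `inputs` from the embeddings section, and the `aligned_tokens` from the note above:

```python
import matplotlib.pyplot as plt
import seaborn as sns
import torch

with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
# attentions[-1] has shape (batch, heads, seq, seq); plot head 0 of the last layer
att = outputs.attentions[-1][0, 0].numpy()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(att, cmap="viridis", xticklabels=aligned_tokens,
            yticklabels=aligned_tokens, ax=ax)
st.pyplot(fig)
```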
-# Text Generation Demo (for BLOOM)
-if "bloom" in model_name:
-    st.subheader("✍️ Text Generation & Token Probabilities")
-    generator = pipeline("text-generation", model=model_name, return_full_text=False)
-    generated_output = generator(input_text, max_length=50, return_tensors=True)
-    st.write("Generated Output:", generated_output[0]["generated_text"])
-
-    # Token Probability Visualization
-    model_gen = AutoModelForCausalLM.from_pretrained(model_name)
-    with torch.no_grad():
-        inputs = tokenizer(input_text, return_tensors="pt")
-        logits = model_gen(**inputs).logits[:, -1, :]
-        probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().detach().numpy()
-        top_tokens = np.argsort(probs)[-10:][::-1]  # Top 10 tokens
-        token_probs = {tokenizer.decode([idx]): probs[idx] for idx in top_tokens}
-        df_probs = pd.DataFrame(token_probs.items(), columns=["Token", "Probability"])
-        fig_prob = px.bar(df_probs, x="Token", y="Probability", title="Top Token Predictions")
-        st.plotly_chart(fig_prob)
-
-st.markdown("💡 *Explore more about Transformer models!*")
+# Text Generation Demo
+st.subheader("✍️ Text Generation & Token Probabilities")
+model_gen = AutoModelForCausalLM.from_pretrained(model_name)
+generator = pipeline("text-generation", model=model_name, return_full_text=False)
+
+# Generate text
+generated_output = generator(input_text, max_length=50, return_tensors=True)
+st.write("Generated Output:", generated_output[0]["generated_text"])
+
+# Token Probability Visualization
+with torch.no_grad():
+    inputs = tokenizer(input_text, return_tensors="pt")
+    logits = model_gen(**inputs).logits[:, -1, :]
+    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().detach().numpy()
+    top_tokens = np.argsort(probs)[-10:][::-1]  # Top 10 tokens
+    token_probs = {tokenizer.decode([idx]): probs[idx] for idx in top_tokens}
+    df_probs = pd.DataFrame(token_probs.items(), columns=["Token", "Probability"])
+    fig_prob = px.bar(df_probs, x="Token", y="Probability", title="Top Token Predictions")
+    st.plotly_chart(fig_prob)
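The `np.argsort` path works; `torch.topk` does the same top-10 selection without the numpy round-trip. A sketch with the same `model_gen`, `tokenizer`, and `input_text`:

```python
import torch

with torch.no_grad():
    logits = model_gen(**tokenizer(input_text, return_tensors="pt")).logits[:, -1, :]
probs = logits.softmax(dim=-1).squeeze(0)
top = torch.topk(probs, k=10)  # highest-probability next tokens
token_probs = {tokenizer.decode([int(i)]): float(p)
               for i, p in zip(top.indices, top.values)}
```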
 
+
+st.markdown("💡 *Explore more about Large Language Models!*\n")
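Two review-style caveats on the generation block as committed: it loads the checkpoint three times (`AutoModel`, `AutoModelForCausalLM`, and the `pipeline`, which loads its own copy when given a string name), and with `return_tensors=True` the text-generation pipeline returns token ids rather than a `generated_text` field, so the indexing likely raises a KeyError. A sketch that reuses one causal-LM instance and requests text:

```python
from transformers import AutoModelForCausalLM, pipeline

model_gen = AutoModelForCausalLM.from_pretrained(model_name)
generator = pipeline("text-generation", model=model_gen,
                     tokenizer=tokenizer, return_full_text=False)
out = generator(input_text, max_new_tokens=30)  # returns [{"generated_text": ...}]
st.write("Generated Output:", out[0]["generated_text"])
```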