Molbap (HF Staff) committed

Commit bd27c9a · verified · Parent: 5d68161

Update index.html

Files changed (1):
  index.html (+4 -95)

index.html CHANGED
@@ -303,10 +303,6 @@ class GlmAttention(LlamaAttention):
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim,
                                 config.hidden_size, bias=False)
 
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    # Slightly different RoPE
-
-
 class GlmForCausalLM(LlamaForCausalLM):
     pass
 </code></pre>
@@ -318,7 +314,7 @@ class GlmForCausalLM(LlamaForCausalLM):
 <p>All the code becomes runnable and a self-contained model definition</p>
 <pre><code class="language-python" data-trim>
 
-class GlmMLP(nn.Module):
+class GlmMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
 
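The hunk above only shows the head and tail of the expanded GlmMLP. For orientation, here is a sketch of the gated-MLP pattern it expands to, reconstructed from the transformers GLM modeling code rather than from this diff; the exact attribute names are an assumption and only the `return self.down_proj(up_states)` line is confirmed by the hunk below.

import torch
from torch import nn
from transformers.activations import ACT2FN


class GlmMLP(nn.Module):
    # Sketch only: fused gate/up projection followed by a gated activation.
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        up_states = self.gate_up_proj(hidden_states)
        gate, up_states = up_states.chunk(2, dim=-1)
        up_states = up_states * self.activation_fn(gate)
        return self.down_proj(up_states)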
@@ -336,93 +332,6 @@ class GlmForCausalLM(LlamaForCausalLM):
         return self.down_proj(up_states)
 
 
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs,
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-        attn_weights = attn_weights + causal_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., 0::2]
-    x2 = x[..., 1::2]
-    return torch.stack((-x2, x1), dim=-1).flatten(-2)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-
-    # Interleave them instead of usual shape
-    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
-    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
-
-    # Keep half or full tensor for later concatenation
-    rotary_dim = cos.shape[-1]
-    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
-    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
-
-    # Apply rotary embeddings on the first half or full tensor
-    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
-    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
-
-    # Concatenate back to full shape
-    q_embed = torch.cat([q_embed, q_pass], dim=-1)
-    k_embed = torch.cat([k_embed, k_pass], dim=-1)
-    return q_embed, k_embed
-
-
 class GlmAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
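A quick shape check can make the interleaved, partial-rotary behaviour of the removed apply_rotary_pos_emb concrete. This sketch assumes rotate_half and apply_rotary_pos_emb from the hunk above are in scope; the shapes are illustrative, with cos/sin covering only half of head_dim so the rotate-then-pass-through split is exercised.

import torch

# Illustrative shapes: q/k are [batch, heads, seq, head_dim],
# cos/sin cover only the first half of head_dim (partial rotary).
batch, heads, seq, head_dim = 1, 8, 16, 64
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
cos = torch.randn(batch, seq, head_dim // 2)
sin = torch.randn(batch, seq, head_dim // 2)

# The first head_dim // 2 channels are rotated with interleaved cos/sin;
# the remaining channels pass through untouched.
q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
assert q_embed.shape == q.shape and k_embed.shape == k.shape
assert torch.equal(q_embed[..., head_dim // 2:], q[..., head_dim // 2:])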
@@ -647,7 +556,7 @@ y = torch.empty_like(x)
 activation.gelu_fast(y, x)
 print(y)
 </code></pre>
-<p class="fragment">Same Transformer code — now with a <strong>3× faster</strong> GELU on A100s.</p>
+<p>Same Transformer code — now with a <strong>3× faster</strong> GELU on A100s.</p>
 </section>
 
 <section>
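The `activation.gelu_fast(y, x)` call in this hunk's context matches the Hugging Face kernels-hub activation example. Below is a minimal self-contained sketch, assuming the `activation` object is loaded from the kernels-community/activation repository (the loading step is not shown in this hunk) and that a CUDA device is available.

import torch
from kernels import get_kernel

# Assumption: the slide's `activation` object comes from the kernels hub.
activation = get_kernel("kernels-community/activation")

x = torch.randn(10, 10, dtype=torch.float16, device="cuda")
y = torch.empty_like(x)
activation.gelu_fast(y, x)  # out-variant API: writes the fast GELU of x into y
print(y)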
@@ -722,8 +631,8 @@ model = AutoModelForConditionalGeneration.from_pretrained("Qwen/Qwen3-8B")
 🤝 Symbiotic Growth
 </p>
 <p style="display: flex; align-items: center; gap: 0.4rem; font-size: 1.4rem;">
-<img src="assets/torchlogo.png" alt="PyTorch" style="height: 1.4rem;" />
-PyTorch &amp; <code>transformers</code> grow together
+<img src="assets/transparent_PyTorch.png" alt="PyTorch" style="height: 1.4rem;" />
+<code> PyTorch</code> &amp; <code>transformers</code> grow together
 <img src="assets/head_logo.svg" alt="Transformers" style="height: 1.4rem;" />
 </p>
 </div>
 