Upload folder using huggingface_hub

Files changed:
- .gitattributes +1 -0
- assets/Pytorch_day_botleft.png +0 -0
- assets/fastload.png +0 -0
- assets/head_logo.svg +17 -0
- assets/linux foundation.png +0 -0
- assets/model_growth.html +0 -0
- assets/normalize_time_torch.webp +0 -0
- assets/py2.png +0 -0
- assets/screenpage.png +0 -0
- assets/screenpage2.png +0 -0
- assets/torchlogo.png +0 -0
- assets/visual_debugger.png +3 -0
- revamped_index.html +326 -0
- v0_index.html +314 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/visual_debugger.png filter=lfs diff=lfs merge=lfs -text
assets/Pytorch_day_botleft.png
ADDED
assets/fastload.png
ADDED
assets/head_logo.svg
ADDED
assets/linux foundation.png
ADDED
assets/model_growth.html
ADDED
The diff for this file is too large to render. See raw diff.
assets/normalize_time_torch.webp
ADDED
assets/py2.png
ADDED
assets/screenpage.png
ADDED
assets/screenpage2.png
ADDED
assets/torchlogo.png
ADDED
assets/visual_debugger.png
ADDED
revamped_index.html
ADDED
@@ -0,0 +1,326 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>PyTorch × Transformers Journey</title>

  <!-- Google Fonts -->
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&family=Fira+Code:wght@400;600&display=swap" rel="stylesheet" />

  <!-- Reveal.js core & dark theme base -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reset.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/theme/black.css" id="theme" />

  <!-- Highlight.js -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/highlight.js@11/styles/github-dark.min.css" />

  <!-- Animations -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@4/animate.min.css" />

  <style>
    :root {
      --accent-primary: #ee4c2c;   /* PyTorch orange‑red */
      --accent-secondary: #ffb347; /* lighter highlight */
      --bg-gradient-start: #1b1b1b;
      --bg-gradient-end: #242424;
    }
    html, body { font-family: 'Inter', sans-serif; }
    .reveal .slides {
      background: linear-gradient(135deg, var(--bg-gradient-start), var(--bg-gradient-end));
    }
    .reveal h1, .reveal h2, .reveal h3 { color: var(--accent-primary); font-weight: 800; letter-spacing: -0.5px; }
    .reveal pre code { font-family: 'Fira Code', monospace; font-size: 0.75em; }
    .reveal section img, .reveal section svg { border-radius: 1rem; box-shadow: 0 8px 22px rgba(0,0,0,0.4); }
    .fragment.highlight-current-blue.visible { color: var(--accent-secondary) !important; }
    /* slide-density patch */
    .reveal h1 { font-size: 2.6rem; line-height: 1.1; }
    .reveal h2 { font-size: 1.9rem; line-height: 1.15; }
    .reveal h3 { font-size: 1.4rem; line-height: 1.2; }
    .reveal p, .reveal li { font-size: 0.9rem; line-height: 1.35; }
    .reveal pre code { font-size: 0.67em; }
    @media (max-width: 1024px) { .reveal h1{font-size:2.2rem;} .reveal h2{font-size:1.6rem;} }
    .reveal table td, .reveal table th { font-size: 0.85rem; padding: 4px 8px; }
  </style>
</head>
<body>
  <div class="reveal">
    <div class="slides">

      <!-- 1 · Opening -->
      <section data-auto-animate>
        <h1 class="animate__animated animate__fadeInDown">PyTorch × Transformers Journey</h1>
        <h3 class="animate__animated animate__fadeInDown animate__delay-1s">Pythonicity, Autodiff & Modularity in Modern AI</h3>
        <p class="animate__animated animate__fadeInUp animate__delay-2s">Pablo Montalvo‑Leroux · ML Engineer @ Hugging Face</p>
      </section>

      <!-- 2 · 2016‑2018: Backprop & Birth Pangs -->
      <section>
        <h2>2016‑2018: Backprop & Birth Pangs</h2>
        <ul>
          <li>Hand‑crafted chain rule; frameworks such as Theano and CNTK appeared, then vanished.</li>
          <li>MLPs → RNNs → LSTMs — until <strong>BERT</strong> detonated the field in 2018.</li>
          <li class="fragment">Reproducibility was painful ✗ — until Transformers met PyTorch ✓.</li>
        </ul>
      </section>

      <!-- 3 · Static vs Dynamic Graphs -->
      <section>
        <h2>Static vs Dynamic Graphs</h2>
        <p class="fragment">Static graphs require you to compile, wait, and cross your fingers that the bug reproduces.</p>
        <p class="fragment">Dynamic graphs mean you can drop <code>pdb.set_trace()</code> anywhere and continue iterating.</p>
        <p class="fragment"><code>torch.compile</code> gives the best of both worlds: write dynamically, ship something ahead‑of‑time optimised.</p>
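        <p class="fragment">A minimal sketch of that workflow (<code>MyModel</code> and <code>batch</code> are placeholders):</p>
        <pre><code class="language-python" data-trim data-noescape>
        import torch

        model = MyModel().cuda()         # authored eagerly; debuggable line by line
        compiled = torch.compile(model)  # captured and optimised on first call
        out = compiled(batch)            # later calls reuse the compiled graph
        </code></pre>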
      </section>

      <!-- 4 · Dynamic Graphs Enabled Contribution -->
      <section>
        <h2>Dynamic Graphs Enabled Contribution</h2>
        <ul>
          <li>Developers debug at line‑rate — no cold‑start recompiles.</li>
          <li>Pull requests remained reproducible overnight, which accelerated trust.</li>
          <li>Static‑graph alternatives stalled, and the community consolidated around PyTorch.</li>
        </ul>
      </section>

      <!-- 5 · Paper Tonight → Tweak Tomorrow -->
      <section>
        <h2>Clone the Paper Tonight → Tweak Tomorrow</h2>
        <p>Research cadence is measured in <strong>hours</strong>; any friction kills momentum.</p>
        <ul>
          <li class="fragment">2018: BERT fine‑tuning required printing tensors live rather than recompiling graphs.</li>
          <li class="fragment">Community PRs merged overnight — credibility snowballed for both PyTorch and Transformers.</li>
        </ul>
      </section>

      <!-- 6 · One Model · One File -->
      <section>
        <h2>“One Model · One File” — Why it Matters</h2>
        <pre><code class="language-python" data-trim data-noescape>
        # modeling_bert.py — single source of truth 🗄️
        class BertConfig(PretrainedConfig):
            ...

        class BertSelfAttention(nn.Module):
            ...

        class BertLayer(nn.Module):
            ...

        class BertModel(PreTrainedModel):
            def __init__(self, config):
                super().__init__(config)
                self.embeddings = BertEmbeddings(config)
                self.encoder = nn.ModuleList(
                    [BertLayer(config) for _ in range(config.num_hidden_layers)]
                )
                self.init_weights()
        </code></pre>
        <ul>
          <li>All layers, the forward pass, and <code>from_pretrained()</code> logic live together.</li>
          <li>No cross‑file inheritance maze — copy to Colab, hack, and run.</li>
          <li>Reviewers diff one file; merge time dropped from days to hours.</li>
        </ul>
      </section>

      <!-- 7 · Transformers Grew With Python -->
      <section>
        <h2>Transformers Grew with Python</h2>
        <ul>
          <li>The library prioritises hackability, which in turn accelerates adoption.</li>
          <li>Python is slow by default, so we lean on compiled CUDA kernels and Triton for raw speed.</li>
          <li>The new <strong>Kernel Hub</strong> means Transformers automatically uses a faster op the moment it is published — no application changes required.</li>
        </ul>
      </section>

      <!-- 8 · Back to Python: Mary Shelley Mode -->
      <section>
        <h2>Back to Python: Modular “Mary Shelley” Mode</h2>
        <p>Compose new blocks via subclassing and selective override.</p>
        <pre><code class="language-python" data-trim data-noescape>
        class LlamaRotaryLoRA(LlamaAttention):
            def __init__(...):
                super().__init__(...)
                self.q_proj = LoRA(self.q_proj)  # swap in LoRA
                self.apply_rotary()              # keep RoPE
        </code></pre>
      </section>

      <!-- 9 · Logit Debugger -->
      <section>
        <h2>Logit Debugger: Trust but Verify</h2>
        <ul>
          <li>Attach a hook to every <code>nn.Module</code>; dump logits layer by layer.</li>
          <li>Spot ε‑level drifts — LayerNorm precision, FP16 underflow, etc.</li>
          <li>JSON traces are diffable in CI, so regressions stay caught (see the sketch below).</li>
        </ul>
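        <p class="fragment">A minimal sketch of the hook machinery (assumes a loaded <code>model</code> and tokenised <code>inputs</code>):</p>
        <pre><code class="language-python" data-trim data-noescape>
        import json
        import torch

        trace = {}

        def make_hook(name):
            def hook(module, args, output):
                if isinstance(output, torch.Tensor):
                    # one summary statistic per layer keeps the trace small and diffable
                    trace[name] = output.float().abs().mean().item()
            return hook

        for name, module in model.named_modules():
            module.register_forward_hook(make_hook(name))

        model(**inputs)
        print(json.dumps(trace, indent=2))  # diff this JSON in CI
        </code></pre>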
      </section>

      <!-- 10 · DTensor & TP API -->
      <section>
        <h2>DTensor & Tensor‑Parallel API</h2>
        <ul>
          <li>Logical tensor views unlock device‑mesh sharding.</li>
          <li>The <code>tp_plan</code> JSON keeps model code pristine and declarative.</li>
          <li>We regularly validate 100‑billion‑parameter checkpoints inside HF test infra.</li>
        </ul>
        <img data-src="assets/mesh.svg" alt="Device mesh" />
      </section>

      <!-- 11 · Zero‑Config Parallelism -->
      <section>
        <h2>Zero‑Config Parallelism</h2>
        <pre><code class="language-json" data-trim data-noescape>{
        "layer.*.self_attn.q_proj": "colwise",
        "layer.*.self_attn.k_proj": "colwise",
        "layer.*.self_attn.v_proj": "colwise",
        "layer.*.self_attn.o_proj": "rowwise"
        }</code></pre>
        <pre><code class="language-python" data-trim data-noescape>
        from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel

        def translate_to_torch_parallel_style(style: str):
            if style == "colwise":
                return ColwiseParallel()
            elif style == "rowwise":
                return RowwiseParallel()
        </code></pre>
        <p class="fragment">One JSON file loads a 17‑billion‑parameter Llama‑4 on 8 GPUs; tweak the plan, not the network.</p>
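        <p class="fragment">End to end, the intent is a single call (a sketch; it mirrors the snippet in this repo's speaker notes):</p>
        <pre><code class="language-python" data-trim data-noescape>
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            torch_dtype="bfloat16",
            tp_plan="auto",  # shard according to the model's JSON plan
        )
        print(model._tp_plan)
        </code></pre>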
      </section>

      <!-- 12 · Cache Allocator -->
      <section>
        <h2>Load Faster & Stronger: Cache Allocator</h2>
        <p>Zero‑copy weight sharding shaves <strong>15 %</strong> of VRAM on an A100 while cutting load time below 60 s for a 100‑B model.</p>
        <img data-src="assets/memory_bars.svg" alt="Memory bars" />
      </section>

      <!-- 13 · Modular Transformers: GLM Example -->
      <section>
        <h2>Modular Transformers: GLM by Example</h2>
        <pre><code class="language-python" data-trim>
        class GlmMLP(Phi3MLP):
            pass

        class GlmAttention(LlamaAttention):
            def __init__(self, config, layer_idx=None):
                super().__init__(config, layer_idx)
                self.o_proj = nn.Linear(
                    config.num_attention_heads * self.head_dim,
                    config.hidden_size,
                    bias=False,
                )

        def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
            # Slightly different RoPE
            ...

        class GlmForCausalLM(LlamaForCausalLM):
            pass
        </code></pre>
        <p>AST magic expands this 40‑line prototype into a full modelling file, ready for training.</p>
      </section>

      <!-- 14 · Rise of Multimodality -->
      <section>
        <h2>Rise of Multimodality</h2>
        <pre><code class="language-python" data-trim data-noescape>
        processor = AutoProcessor.from_pretrained("Qwen/Qwen3-8B")
        model = AutoModelForConditionalGeneration.from_pretrained("Qwen/Qwen3-8B")
        </code></pre>
        <p class="fragment">Same API across text, vision, and audio: learn once, apply everywhere.</p>
      </section>

      <!-- 15 · Why Python wins -->
      <section>
        <h2>Why Python Wins</h2>
        <ul>
          <li>Low entry barrier attracts newcomers and domain specialists alike.</li>
          <li>High‑level semantics concisely express low‑level intent.</li>
          <li>The C++/Rust back‑end remains accessible for critical paths.</li>
        </ul>
      </section>

      <!-- 16 · Where Python can bite -->
      <section>
        <h2>Where Python can bite 🐍</h2>
        <ul>
          <li class="fragment">Interpreter overhead hurts microkernels (token‑by‑token decoding).</li>
          <li class="fragment">The GIL throttles concurrent host‑side work.</li>
          <li class="fragment">Fresh research code is easy to leave unoptimised.</li>
        </ul>
        <p class="fragment">Mitigations: Triton, compiled custom ops, compile‑time fallbacks, and callable kernels.</p>
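        <p class="fragment">For instance, a Triton kernel moves the hot loop out of the interpreter (a minimal sketch assuming Triton is installed, not a tuned kernel):</p>
        <pre><code class="language-python" data-trim data-noescape>
        import torch
        import triton
        import triton.language as tl

        @triton.jit
        def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
            # each program instance handles one BLOCK-sized slice
            offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
            mask = offs < n
            x = tl.load(x_ptr + offs, mask=mask)
            y = tl.load(y_ptr + offs, mask=mask)
            tl.store(out_ptr + offs, x + y, mask=mask)

        def add(x, y):
            out = torch.empty_like(x)
            grid = (triton.cdiv(x.numel(), 1024),)
            add_kernel[grid](x, y, out, x.numel(), BLOCK=1024)
            return out
        </code></pre>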
      </section>

      <!-- 17 · Kernel Hub -->
      <section>
        <h2>Kernel Hub: Optimised Ops from the Community</h2>
        <p>Kernel Hub lets any Python program <em>download and hot‑load</em> compiled CUDA/C++ kernels directly from the Hugging Face Hub at runtime.</p>
        <ul>
          <li><strong>Portable</strong> – kernels work from arbitrary paths outside <code>PYTHONPATH</code>.</li>
          <li><strong>Unique</strong> – load multiple versions of the same op side‑by‑side in one process.</li>
          <li><strong>Compatible</strong> – every kernel targets all recent PyTorch wheels (CUDA, ROCm, CPU) and C‑library ABIs.</li>
        </ul>
        <p class="fragment">🚀 <strong>Quick start</strong> (requires <code>torch >= 2.5</code>):</p>
        <pre><code class="language-bash" data-trim>pip install kernels</code></pre>
        <pre><code class="language-python" data-trim data-noescape>
        import torch
        from kernels import get_kernel

        # Download optimised kernels from the Hugging Face Hub
        activation = get_kernel("kernels-community/activation")

        x = torch.randn(10, 10, dtype=torch.float16, device="cuda")
        y = torch.empty_like(x)
        activation.gelu_fast(y, x)
        print(y)
        </code></pre>
        <p class="fragment">Same Transformer code — now with a <strong>3× faster</strong> GELU on A100s.</p>
      </section>

      <!-- 18 · API design lessons -->
      <section>
        <h2>API Design Lessons</h2>
        <ul>
          <li>Make easy things obvious, and hard things merely possible.</li>
          <li>Keep the paper‑to‑repository delta minimal for new models.</li>
          <li>Hide sharding mechanics; expose developer intent.</li>
        </ul>
        <p class="fragment">We tune radios without learning RF theory — ML frameworks should feel as frictionless.</p>
      </section>

      <!-- 19 · Model Growth by Modality -->
      <section>
        <h2>Model Growth by Modality</h2>
        <iframe src="model_growth.html" width="100%" height="600" style="border:none;"></iframe>
      </section>

      <!-- 20 · Takeaways -->
      <section>
        <h2>Takeaways & The Future</h2>
        <ul>
          <li>PyTorch and <code>transformers</code> have grown symbiotically for eight years — expect the spiral to continue.</li>
          <li>Pythonicity plus pragmatism keeps the barrier to innovation low.</li>
          <li>Open‑source models are shipping faster, larger, and more multimodal than ever.</li>
        </ul>
        <p><a href="https://huggingface.co/transformers/contribute" target="_blank">hf.co/transformers/contribute</a></p>
      </section>

    </div>
  </div>

  <!-- Reveal.js core -->
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/highlight/highlight.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/notes/notes.js"></script>
  <!-- Plotly for interactive charts -->
  <script src="https://cdn.plot.ly/plotly-2.31.1.min.js"></script>
  <script>
    Reveal.initialize({
      hash: true,
      slideNumber: true,
      transition: 'slide',
      backgroundTransition: 'convex',
      plugins: [ RevealHighlight, RevealNotes ]
    });
  </script>
</body>
</html>
v0_index.html
ADDED
@@ -0,0 +1,314 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>PyTorch × Transformers Journey</title>

  <!-- Google Fonts -->
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&family=Fira+Code:wght@400;600&display=swap" rel="stylesheet" />

  <!-- Reveal.js core & dark theme base -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reset.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/theme/black.css" id="theme" />

  <!-- Highlight.js -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/highlight.js@11/styles/github-dark.min.css" />

  <!-- Animations -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@4/animate.min.css" />

  <!-- Custom palette from uploaded .pptx (approximated) -->
  <style>
    :root {
      /* Extracted dominant colors from the PPTX master */
      --accent-primary: #ee4c2c;   /* PyTorch orange‑red */
      --accent-secondary: #ffb347; /* lighter highlight */
      --bg-gradient-start: #1b1b1b;
      --bg-gradient-end: #242424;
    }

    html, body {
      font-family: 'Inter', sans-serif;
    }
    .reveal .slides {
      background: linear-gradient(135deg, var(--bg-gradient-start), var(--bg-gradient-end));
    }
    .reveal h1,
    .reveal h2,
    .reveal h3 {
      color: var(--accent-primary);
      font-weight: 800;
      letter-spacing: -0.5px;
    }
    .reveal pre code {
      font-family: 'Fira Code', monospace;
      font-size: 0.75em;
    }
    .reveal section img,
    .reveal section svg {
      border-radius: 1rem;
      box-shadow: 0 8px 22px rgba(0,0,0,0.4);
    }
    .fragment.highlight-current-blue.visible {
      color: var(--accent-secondary) !important;
    }
    /* ——— slide-density patch ——— */
    .reveal h1 { font-size: 2.6rem; line-height: 1.1; } /* was huge */
    .reveal h2 { font-size: 1.9rem; line-height: 1.15; }
    .reveal h3 { font-size: 1.4rem; line-height: 1.2; }

    .reveal p,
    .reveal li { font-size: 0.9rem; line-height: 1.35; }

    .reveal pre code { font-size: 0.67em; } /* keep code readable */

    @media (max-width: 1024px) { /* fallback for small screens */
      .reveal h1 { font-size: 2.2rem; }
      .reveal h2 { font-size: 1.6rem; }
    }

    /* tweak table cells */
    .reveal table td,
    .reveal table th { font-size: 0.85rem; padding: 4px 8px; }

  </style>
</head>
<body>
  <div class="reveal">
    <div class="slides">
      <!-- 1. Opening -->
      <section data-auto-animate>
        <h1 class="animate__animated animate__fadeInDown">PyTorch × Transformers Journey</h1>
        <h3 class="animate__animated animate__fadeInDown animate__delay-1s">Pythonicity, Autodiff & Modularity in Modern AI</h3>
        <p class="animate__animated animate__fadeInUp animate__delay-2s">Pablo Montalvo‑Leroux · ML Engineer @ Hugging Face</p>
      </section>

      <!-- 4. Static vs Dynamic Graphs -->
      <section>
        <h2>Transformers and Static vs Dynamic Graphs</h2>
        <p class="fragment">Static graphs and Transformers' existence are inversely correlated.</p>
        <p class="fragment">torch.compile ≈ sweet spot: author in dynamic, ship in static.</p>
      </section>

      <section>
        <h2>Clone the paper tonight → tweak tomorrow</h2>
        <p>Research cadence ≈ <strong>hours</strong>; any friction kills momentum.</p>
        <ul>
          <li class="fragment">2018: BERT fine-tunes needed <em>live</em> tensor prints, not graph recompiles.</li>
          <li class="fragment">Community PRs were merged overnight → library credibility snowballed.</li>
        </ul>
      </section>

      <!-- 6. Mary Shelley mode (unchanged) -->
      <!-- I also need you to include actual wins from transformers: it is a SOURCE OF MODEL TRUTH. We have 315 models defined and more every release; it is the reference, with very simple definitions. The Mary Shelley / Frankenstein new addition is modular. One decision we took long ago that proved extremely useful was to have one file == one model, without any inheritance. All the code defining a model would be self-contained, so anyone could hack it, take a part here and there, and modify it. It worked: it drove up adoption; researchers and students were able to use it. However, that design was not Python-first, and we changed it recently WHILE keeping the original intent. Now we have an AST-based toposorting algorithm that parses a modular file: say it uses the vision encoder from Qwen 2.5VL, the layernorm of a Cohere model, the preprocessing of an AllenAI model, and an original attention. This modular file can use all the inheritance across transformers, but it will not be run. Once the algorithm expands the modular file, it becomes a modeling file: very verbose (it can be thousands of lines) but still hackable, since everything that defines the model is contained in it, and everyone can use it. Everything will work, including autodiff, thanks to torch in the backend.
      That unlocks something else I want to talk about: thanks to modular, researchers and practitioners with a new hypothesis on, say, a layernorm can very easily write a PyTorch file defining the model AND have it compatible with transformers almost instantly, thus reactivating the power of Pythonic inheritance. Paradigms exist for a reason. -->
      <!-- 3. One Model / One File -->
      <section>
        <h2>Transformers Philosophy: One Model · One File pattern</h2>
        <pre><code data-trim data-noescape class="language-python">
        class BertEmbeddings(nn.Module):
            …

        class BertModel(BertPreTrainedModel):
            …
        </code></pre>
        <p class="fragment">Atomic PRs → faster reviews → community velocity.</p>
      </section>

      <section>
        <h2>Back to Python: Modular Mary Shelley Mode</h2>
        <p>Compose new blocks via subclass & override.</p>
        <pre><code data-trim data-noescape class="language-python">
        class LlamaRotaryLoRA(LlamaAttention):
            def __init__(…):
                super().__init__(…)
                self.q_proj = LoRA(self.q_proj)
                self.apply_rotary()
        </code></pre>
      </section>

      <!-- 8. DTensor & TP API -->
      <!-- (New) Slide 9 · Tensor Parallelism — Split the Math, Not the Model
      What problem? 175 B-param giants overflow a single GPU.
      Core idea: shard each linear kernel across devices.

      Partition   | How it shards                        | Comm op           | Notes
      Column-wise | W = [W₀‖W₁], split on out-features   | AllGather outputs | input is broadcast; biases split
      Row-wise    | Wᵀ = [W₀; W₁], split on in-features  | AllReduce outputs | biases replicated

      Rule of thumb – do column-wise first, row-wise second → halves the data you move.
      Speaker cue: “Think Lego® bricks: first slice them long-ways, then stack the halves.”

      (New) Slide 10 · Sequence ↔ Tensor: Keeping LayerNorms Honest

      Tensor-parallel linear + attention can’t see the whole hidden state; LayerNorm/Dropout must.
      Sequence Parallelism bridges the gap:
      1. After row-wise: ReduceScatter → distribute tokens across GPUs.
      2. LayerNorm / Dropout run locally (full hidden dim, partial sequence).
      3. Before the next TP region: AllGather to rebuild complete activations.

      Caveats
      - Extra comm, but still intra-node (NVLink/XeLink).
      - Works layer-by-layer; no model-wide refactor needed.

      Speaker cue: “Think relay race—baton = activations, hand-offs cost time, so we keep them short.”

      (New) Slide 11 · Transformers “tp_plan” — Zero-Config Sharding

      from transformers import AutoModelForCausalLM
      model = AutoModelForCausalLM.from_pretrained(
          "meta-llama/Llama-4-Scout-17B-16E-Instruct",
          torch_dtype="bfloat16",
          tp_plan="auto")
      print(model._tp_plan)

      Example plan (8 GPUs):

      {
        "layer.*.self_attn.q_proj": "colwise",
        "layer.*.self_attn.k_proj": "colwise",
        "layer.*.self_attn.v_proj": "colwise",
        "layer.*.self_attn.o_proj": "rowwise",
        "...": "..."
      }

      Wins
      - One line to load a 17 B-param model on 8 GPUs.
      - Plan stored as JSON → editable, version-controlled, readable by chip vendors.
      - Built on DTensor, DeviceMesh; future-proof for expert/MoE shards.

      Speaker cue: “The plan is metadata—the model code stays pristine. Change the JSON, not the network.” -->
      <section>
        <h2>DTensor & Tensor‑Parallel API</h2>
        <ul>
          <li>Logical tensor view · device mesh</li>
          <li><code>tp_plan</code> keeps module code intact</li>
          <li>100 B‑param validation inside HF tests</li>
        </ul>
        <img data-src="assets/mesh.svg" alt="Device mesh" />
      </section>

      <!-- 9. Cache Allocator -->
      <section>
        <h2>Smarter Memory: Cache Allocator</h2>
        <p>0‑copy weight partitioning · 15 % RAM cut on A100</p>
        <img data-src="assets/memory_bars.svg" alt="Memory bars" />
      </section>

      <!-- 11. Multimodal Rise -->
      <section>
        <h2>Rise of Multimodality</h2>
        <pre><code data-trim data-noescape class="language-python">
        processor = AutoProcessor.from_pretrained("Qwen/Qwen3-8B")
        model = AutoModelForConditionalGeneration.from_pretrained("Qwen/Qwen3-8B")
        </code></pre>
        <p class="fragment">Same API across text · vision · audio.</p>
      </section>

      <!-- 12. Why Python wins -->
      <section>
        <h2>Why Python Wins</h2>
        <ul>
          <li>Low entry barrier</li>
          <li>High‑level semantics express low‑level intent</li>
          <li>Seamless C++/Rust extension points</li>
        </ul>
      </section>

      <section>
        <h2>Where Python can bite 🐍</h2>
        <ul>
          <li class="fragment">Interpreter overhead on microkernels (token‑by‑token decode)</li>
          <li class="fragment">GIL can throttle async host‑side work</li>
          <li class="fragment">Easy to under‑optimise code that is fresh out of the lab</li>
        </ul>
        <p class="fragment">Mitigations: Triton, compiled custom ops, compile‑time fallback, and callable kernels!</p>
      </section>

      <!-- 10. Community Kernels -->
      <section>
        <h2>Community Kernels</h2>
        <p>New initiative:</p>
        <p><a href="https://huggingface.co/kernels-community" target="_blank">huggingface.co/kernels-community</a></p>
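        <p class="fragment">Quick start (the same snippet as the Kernel Hub slide in the revamped deck; requires <code>pip install kernels</code>):</p>
        <pre><code data-trim data-noescape class="language-python">
        import torch
        from kernels import get_kernel

        # Download optimised kernels from the Hugging Face Hub
        activation = get_kernel("kernels-community/activation")

        x = torch.randn(10, 10, dtype=torch.float16, device="cuda")
        y = torch.empty_like(x)
        activation.gelu_fast(y, x)
        </code></pre>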
      </section>

      <!-- 14. Lessons for API designers -->
      <section>
        <h2>API Design Lessons</h2>
        <ul>
          <li>Make easy things obvious, hard things possible</li>
          <li>Paper‑to‑repository difference should be minimal</li>
          <li>Hide sharding, expose intent</li>
        </ul>
        <p class="fragment">We want to facilitate adoption. How does a radio work? Would you know how to tune one?</p>
        <p class="fragment">How does a computer work? Should you need to know before you can navigate the web?</p>
      </section>

      <section>
        <h2>Model Growth by Modality</h2>
        <iframe src="model_growth.html" width="100%" height="600" style="border:none;"></iframe>
      </section>

      <!-- 16. Takeaways -->
      <section>
        <h2>Takeaways & The Future</h2>
        <ul>
          <li>PyTorch & HF Transformers grow symbiotically</li>
          <li>Pythonicity × pragmatism drive adoption</li>
          <li>Open-source models are shipped more than ever, and accomplish more than ever, thanks to initiatives such as ours</li>
        </ul>
        <p><a href="https://huggingface.co/transformers/contribute" target="_blank">hf.co/transformers/contribute</a></p>
      </section>
    </div>
  </div>

  <!-- Reveal.js core -->
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/highlight/highlight.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/notes/notes.js"></script>

  <!-- Plotly for interactive charts -->
  <script src="https://cdn.plot.ly/plotly-2.31.1.min.js"></script>

  <script>
    /* Initialise Reveal with extras */
    Reveal.initialize({
      hash: true,
      slideNumber: true,
      transition: 'slide',
      backgroundTransition: 'convex',
      plugins: [ RevealHighlight, RevealNotes ]
    });

    /* LOC growth plot (only runs if a slide provides the container) */
    const locGrowth = document.getElementById('loc-growth');
    if (locGrowth) {
      const years = ['2018', '2020', '2022', '2025'];
      const loc = [200, 40_000, 120_000, 315_000];
      Plotly.newPlot(locGrowth, [{
        x: years,
        y: loc,
        type: 'scatter',
        mode: 'lines+markers',
        hovertemplate: '%{y:,} LOC in %{x}<extra></extra>'
      }], {title: 'Lines‑of‑Code growth (log‑scale)', yaxis: {type: 'log'}}, {displayModeBar: false});
    }

    /* compile benchmark (only runs if a slide provides the container) */
    const compilePlot = document.getElementById('compile-plot');
    if (compilePlot) {
      Plotly.newPlot(compilePlot, [{
        x: ['Baseline', 'torch.compile'],
        y: [100, 62],
        type: 'bar',
        text: ['1×', '1.6×'],
        textposition: 'auto'
      }], {
        title: 'Decoder LM latency ↓',
        margin: {t: 40, l: 40, r: 40, b: 40}
      }, {displayModeBar: false});
    }
  </script>
</body>
</html>