<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>PyTorch × Transformers Journey</title>

  <!-- Google Fonts -->
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;800&family=Fira+Code:wght@400;600&display=swap" rel="stylesheet" />

  <!-- Reveal.js core & dark theme base -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reset.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/theme/black.css" id="theme" />

  <!-- Highlight.js -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/styles/github-dark.min.css" />

  <!-- Animations -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@4/animate.min.css" />

  <!-- Custom palette from uploaded .pptx (approximated) -->
  <style>
    :root {
      /* Extracted dominant colors from PPTX master */
      --accent-primary: #ee4c2c; /* PyTorch orange‑red */
      --accent-secondary: #ffb347; /* lighter highlight */
      --bg-gradient-start: #1b1b1b;
      --bg-gradient-end: #242424;
    }

    html, body {
      font-family: 'Inter', sans-serif;
    }
    .reveal .slides {
      background: linear-gradient(135deg, var(--bg-gradient-start), var(--bg-gradient-end));
    }
    .reveal h1,
    .reveal h2,
    .reveal h3 {
      color: var(--accent-primary);
      font-weight: 800;
      letter-spacing: -0.5px;
    }
    .reveal pre code {
      font-family: 'Fira Code', monospace;
      font-size: 0.75em;
    }
    .reveal section img,
    .reveal section svg {
      border-radius: 1rem;
      box-shadow: 0 8px 22px rgba(0,0,0,0.4);
    }
    .fragment.highlight-current-blue.visible {
      color: var(--accent-secondary) !important;
    }
    /* ———  slide-density patch  ——— */
    .reveal h1        { font-size: 2.6rem; line-height: 1.1; }   /* was huge */
    .reveal h2        { font-size: 1.9rem; line-height: 1.15; }
    .reveal h3        { font-size: 1.4rem; line-height: 1.2;  }

    .reveal p,
    .reveal li        { font-size: 0.9rem; line-height: 1.35; }

    .reveal pre code  { font-size: 0.67em; }                     /* keep code readable */

    @media (max-width: 1024px) {                                  /* fallback for small screens */
      .reveal h1 { font-size: 2.2rem; }
      .reveal h2 { font-size: 1.6rem; }
    }

    /* tweak table cells */
    .reveal table td,
    .reveal table th { font-size: 0.85rem; padding: 4px 8px; }

  </style>
</head>
<body>
  <div class="reveal">
    <div class="slides">
      <!-- 1. Opening -->
      <section data-auto-animate>
        <h1 class="animate__animated animate__fadeInDown">PyTorch × Transformers Journey</h1>
        <h3 class="animate__animated animate__fadeInDown animate__delay-1s">Pythonicity, Autodiff & Modularity in Modern AI</h3>
        <p class="animate__animated animate__fadeInUp animate__delay-2s">Pablo Montalvo‑Leroux · ML Engineer @ Hugging Face</p>
      </section>

      <!-- 4. Static vs Dynamic Graphs -->
      <section>
        <h2>Transformers and Static vs Dynamic Graphs</h2>
        <p class="fragment">Static graphs and transformers existence are inversely correlated</p>

        <p class="fragment">torch.compile ≈ sweet spot: author in dynamic, ship in static.</p>
      </section>

      <section>
        <h2>Clone the paper tonight → tweak tomorrow</h2>
        <p>Research cadence ≈ <strong>hours</strong>; any friction kills momentum.</p>
        <ul>
          <li class="fragment">2018: BERT fine-tunes needed <em>live</em> tensor prints, not graph recompiles.</li>
          <li class="fragment">Community PRs were merged overnight → library credibility snowballed.</li>
        </ul>
      </section>
      <!-- 6. Mary Shelley mode (unchanged) -->
      <!-- I also need you to include actual wins from transformers; it is a SOURCE OF MODEL TRUTH. We have 315 models defined and more every release. It is the reference, with very simple definitions. The Mary Shelley / Frankenstein new addition is modular. One decision we took long ago, and which proved extremely useful, was one file == one model, without any inheritance. All the code defining a model would be self-contained, so anyone could hack it, take a part here and there, and modify it. It worked: it drove up adoption; researchers and students were able to use it. However, that design does not put Python first, and we recently changed it WHILE keeping the original intent. Now we have an AST-based toposorting algorithm that parses a modular file. Say it uses the vision encoder from Qwen 2.5VL, the layernorm of a Cohere model, the preprocessing of an AllenAI model, and an original attention: the modular file can use all the inheritance across transformers, but it is never run directly. Once the algorithm expands it, it becomes a modeling file, very verbose (it can be thousands of lines), but still hackable: everything that defines the model is contained in it, and everyone can use it. Everything works, including autodiff, thanks to torch in the backend.
      That also unlocks something I want to talk about: thanks to modular, researchers and practitioners with a new hypothesis on, say, a layernorm can write a PyTorch file defining the model AND have it compatible with transformers almost instantly, reactivating the power of Pythonic inheritance. Paradigms exist for a reason. -->
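      <!-- Modular sketch for the notes above (hypothetical file and class names; the real
           mechanism expands a modular_*.py file, via the AST pass, into a standalone
           self-contained modeling_*.py):

      # modular_my_model.py
      from transformers.models.llama.modeling_llama import LlamaAttention

      class MyModelAttention(LlamaAttention):
          def forward(self, *args, **kwargs):
              # new hypothesis goes here; the expansion inlines everything else
              return super().forward(*args, **kwargs)
      -->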
      <!-- 3. One Model / One File -->
      <section>
        <h2>Transformers Philosophy: One Model · One File pattern</h2>
        <pre><code data-trim data-noescape class="language-python">
class BertEmbeddings(nn.Module):
    …
class BertModel(BertPreTrainedModel):
    …
        </code></pre>
        <p class="fragment">Atomic PRs → faster reviews → community velocity.</p>
      </section>
      
      <section>
        <h2>Back to Python: Modular Mary Shelley Mode</h2>
        <p>Compose new blocks via subclass & override.</p>
        <pre><code data-trim data-noescape class="language-python">
class LlamaRotaryLoRA(LlamaAttention):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.q_proj = LoRA(self.q_proj)  # wrap the query projection
        self.apply_rotary()              # keep rotary embeddings in place
        </code></pre>
      </section>
    
      <!-- 8. DTensor & TP API -->
      <!--(New) Slide 9 · Tensor Parallelism — Split the Math, Not the Model
    What problem? 175 B-param giants overflow a single GPU.
    Core idea: shard each linear kernel across devices.

    Partition   | How it shards                       | Comm op           | Notes
    Column-wise | W = [W₀‖W₁], split on out-features  | AllGather outputs | Input is broadcast; biases split
    Row-wise    | Wᵀ = [W₀; W₁], split on in-features | AllReduce outputs | Biases replicated

        Rule of thumb: do column-wise first, row-wise second → halves the data you move.
        Speaker cue: “Think Lego® bricks: first slice them long-ways, then stack the halves.”
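
    Sketch (single process, two "ranks", toy sizes; the sum stands in for AllReduce):

    import torch
    x = torch.randn(1, 8)
    W1, W2 = torch.randn(8, 16), torch.randn(16, 8)
    # Column-wise: each rank holds half of W1's output features (no comm yet).
    h0, h1 = x @ W1[:, :8], x @ W1[:, 8:]
    # Row-wise: each rank consumes its local half, then AllReduce (here, +).
    y = h0 @ W2[:8, :] + h1 @ W2[8:, :]
    assert torch.allclose(y, x @ W1 @ W2, atol=1e-4)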

    (New) Slide 10 · Sequence ↔ Tensor: Keeping LayerNorms Honest

    Tensor-parallel linear + attention can’t see the whole hidden state; LayerNorm/Dropout must.
    Sequence Parallelism bridges the gap:

        After row-wise: ReduceScatter → distribute tokens across GPUs.

        LayerNorm / Dropout run locally (full hidden dim, partial sequence).

        Before next TP region: AllGather to rebuild complete activations.

    Caveats

        Extra comm, but still intra-node (NVLink/XeLink)

        Works layer-by-layer; no model-wide refactor needed.

    Speaker cue: “Think relay race—baton = activations, hand-offs cost time, so we keep them short.”
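
    Sketch (one process emulating two ranks; cat stands in for AllGather):

    import torch
    hidden = torch.randn(4, 8)            # (seq, hidden) coming out of a row-wise region
    s0, s1 = hidden[:2], hidden[2:]       # ReduceScatter: tokens split across ranks
    ln = torch.nn.LayerNorm(8)
    s0, s1 = ln(s0), ln(s1)               # LayerNorm runs locally: full hidden dim, partial seq
    rebuilt = torch.cat([s0, s1], dim=0)  # AllGather rebuilds activations for the next TP region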
    (New) Slide 11 · Transformers “tp_plan” — Zero-Config Sharding

    from transformers import AutoModelForCausalLM
    model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            torch_dtype="bfloat16",
            tp_plan="auto")
    print(model._tp_plan)

    Example plan (8 GPUs):

    {
      "layer.*.self_attn.q_proj": "colwise",
      "layer.*.self_attn.k_proj": "colwise",
      "layer.*.self_attn.v_proj": "colwise",
      "layer.*.self_attn.o_proj": "rowwise",
      "...": "..."
    }

    Wins

        One line to load a 17 B-param model on 8 GPUs.

        Plan stored as JSON → editable, version-controlled, readable by chip vendors.

        Built on DTensor, DeviceMesh; future-proof for expert/moe shards.

    Speaker cue: “The plan is metadata—the model code stays pristine. Change the JSON, not the network.”-->
      <section>
        <h2>DTensor & Tensor‑Parallel API</h2>
        <ul>
          <li>Logical tensor view · device mesh</li>
          <li><code>tp_plan</code> keeps module code intact</li>
          <li>100B param validation inside HF tests</li>
        </ul>
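        <p class="fragment">One line to shard (sketch from the draft notes above; checkpoint name as drafted there):</p>
        <pre><code data-trim data-noescape class="language-python">
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    torch_dtype="bfloat16",
    tp_plan="auto",   # the sharding plan is metadata, not model code
)
print(model._tp_plan)
        </code></pre>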
        <img data-src="assets/mesh.svg" alt="Device mesh" />
      </section>

      <!-- 9. Cache Allocator -->
      <section>
        <h2>Smarter Memory: Cache Allocator</h2>
        <p>0‑copy weight partitioning · 15 % RAM cut on A100</p>
        <img data-src="assets/memory_bars.svg" alt="Memory bars" />
      </section>

      <!-- 11. Multimodal Rise -->
      <section>
        <h2>Rise of Multimodality</h2>
        <pre><code data-trim data-noescape class="language-python">
from transformers import AutoProcessor, AutoModelForImageTextToText

ckpt = "Qwen/Qwen2.5-VL-7B-Instruct"
processor = AutoProcessor.from_pretrained(ckpt)
model = AutoModelForImageTextToText.from_pretrained(ckpt)
        </code></pre>
        <p class="fragment">Same API across text · vision · audio.</p>
      </section>

      <!-- 12. Why Python wins -->
      <section>
        <h2>Why Python Wins</h2>
        <ul>
          <li>Low entry barrier</li>
          <li>High‑level semantics express low‑level intent</li>
          <li>Seamless C++/Rust extension points</li>
        </ul>
      </section>

      <section>
        <h2>Where Python can bite 🐍</h2>
        <ul>
          <li class="fragment">Interpreter overhead on microkernels (token‑by‑token decode)</li>
          <li class="fragment">GIL can throttle async host‑side work</li>
          <li class="fragment">Easy to under-optimize code that is fresh out of the lab.</li>
        </ul>
        <p class="fragment">Mitigations: Triton, compiled custom ops, compile‑time fallback, and callable kernels!</p>
      </section>

      <!-- 10. Community Kernels -->
      <section>
        <h2>Community Kernels</h2>
        <p>A new initiative: optimized kernels, shared and versioned on the Hub.</p>
        <p><a href="https://huggingface.co/kernels-community" target="_blank">hf.co/kernels-community</a></p>
      </section>

      <!-- 14. Lessons for API designers -->
      <section>
        <h2>API Design Lessons</h2>
        <ul>
          <li>Make easy things obvious, hard things possible</li>
          <li>Paper‑to‑repository difference should be minimal</li>
          <li>Hide sharding, expose intent</li>
        </ul>
        <p class="fragment">We want to facilitate adoption. How does a radio work? Would you know how to tune it? </p>
          <p class="fragment">  How does a computer work? Should you know how it does to be able to navigate the web? </p>

      </section>

      <section>
        <h2>Model Growth by Modality</h2>
        <iframe src="model_growth.html" width="100%" height="600" style="border:none;"></iframe>
      </section>

      <!-- 16. Takeaways -->
      <section>
        <h2>Takeaways & The Future </h2>
        <ul>
          <li>PyTorch & Hugging Face Transformers grow symbiotically</li>
          <li>Pythonicity × pragmatism drive adoption</li>
          <li>Open-source models ship more often, and accomplish more, than ever, thanks in part to initiatives like these</li>
        </ul>
        <p><a href="https://huggingface.co/transformers/contribute" target="_blank">hf.co/transformers/contribute</a></p>
      </section>
    </div>
  </div>

  <!-- Reveal.js core -->
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/dist/reveal.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/highlight/highlight.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/reveal.js@5/plugin/notes/notes.js"></script>

  <!-- Plotly for interactive charts -->
  <script src="https://cdn.plot.ly/plotly-2.31.1.min.js"></script>

  <script>
    /* Initialise Reveal with extras */
    Reveal.initialize({
      hash: true,
      slideNumber: true,
      transition: 'slide',
      backgroundTransition: 'convex',
      plugins: [ RevealHighlight, RevealNotes ]
    });

    /* LOC growth plot */
    const locGrowth = document.getElementById('loc-growth');
    if (locGrowth) {
      const years = ['2018', '2020', '2022', '2025'];
      const loc = [200, 40_000, 120_000, 315_000];
      Plotly.newPlot(locGrowth, [{
        x: years,
        y: loc,
        type: 'scatter',
        mode: 'lines+markers',
        hovertemplate: '%{y:,} LOC in %{x}<extra></extra>'
      }], {title: 'Lines‑of‑Code growth (log‑scale)', yaxis: {type: 'log'}}, {displayModeBar: false});
    }

    /* compile benchmark */
    const compilePlot = document.getElementById('compile-plot');
    if (compilePlot) {
      Plotly.newPlot(compilePlot, [{
        x: ['Baseline', 'torch.compile'],
        y: [100, 62],
        type: 'bar',
        text: ['1×', '1.6×'],
        textposition: 'auto'
      }], {
        title: 'Decoder LM latency ↓',
        margin: {t: 40, l: 40, r: 40, b: 40}
      }, {displayModeBar: false});
    }
  </script>
</body>
</html>