File size: 9,479 Bytes
0dfccf0
a4e4aa9
0dfccf0
 
 
 
 
 
 
 
 
 
 
 
3859cfd
0dfccf0
 
 
 
 
3859cfd
0dfccf0
3859cfd
0dfccf0
3859cfd
0dfccf0
 
 
 
 
 
 
119cf5f
 
 
453253c
 
119cf5f
 
 
 
 
 
 
453253c
 
119cf5f
 
 
 
 
 
 
453253c
 
119cf5f
 
0dfccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453253c
0dfccf0
 
6ba7708
0dfccf0
 
 
 
 
 
68bfa8d
 
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
4439ab3
68bfa8d
 
 
0dfccf0
68bfa8d
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
 
 
0dfccf0
68bfa8d
 
 
 
 
 
 
 
4439ab3
 
 
 
 
0dfccf0
453253c
 
 
68bfa8d
 
453253c
d3024f5
68bfa8d
 
 
 
 
4439ab3
68bfa8d
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4439ab3
0dfccf0
 
c06b59f
 
 
 
 
 
 
 
68bfa8d
 
 
c06b59f
68bfa8d
0dfccf0
 
68bfa8d
 
 
 
c06b59f
 
68bfa8d
 
 
 
 
 
 
0dfccf0
3859cfd
68bfa8d
3859cfd
 
68bfa8d
 
 
3859cfd
 
 
 
 
 
 
68bfa8d
 
 
 
0dfccf0
 
68bfa8d
 
 
 
 
 
 
 
 
 
 
0dfccf0
a4e4aa9
3859cfd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>SmolVLM Benchmark Demo</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    fieldset { margin-bottom: 20px; padding: 10px; }
    legend { font-weight: bold; }
    label { display: block; margin-top: 5px; }
    input, select { margin-bottom: 5px; width: 100%; max-width: 400px; }
    table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; }
    th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
    button { padding: 10px 20px; }
    .model-results { margin-bottom: 40px; }
  </style>
</head>
<body>
  <h1>SmolVLM Benchmark Demo</h1>
  
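  <!-- Model Options: the model selector is disabled and ignored (all three SmolVLM models are always benchmarked); the dtype selectors below are applied to every model -->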
  <fieldset id="model-options">
    <legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend>
    <label for="model-id">Select Model ID:</label>
    <select id="model-id" disabled>
      <option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option>
      <option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option>
    </select>

    <label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label>
  <select id="decoder-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  
  <label for="embed-dtype">Embed Tokens dtype:</label>
  <select id="embed-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  
  <label for="vision-dtype">Vision Encoder dtype:</label>
  <select id="vision-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  </fieldset>

  <!-- Hardware Options -->
  <fieldset id="hardware-options">
    <legend>Hardware Options</legend>
    <label for="device">Select Device:</label>
    <select id="device">
      <option value="wasm">wasm</option>
      <option value="webgpu" selected>webgpu</option>
    </select>
  </fieldset>

  <!-- Benchmark Options -->
  <fieldset id="benchmark-options">
    <legend>Benchmark Options</legend>
    <label for="image-url">Image URL:</label>
    <input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg">

    <label for="do-split">Do Image Splitting (do_image_splitting)</label>
    <input type="checkbox" id="do-split" checked>

    <label for="max-tokens">Number of Tokens to Generate:</label>
    <input type="number" id="max-tokens" value="32">

    <label for="num-runs">Number of Runs:</label>
    <input type="number" id="num-runs" value="3">
  </fieldset>

  <button id="start-benchmark">Start Benchmark</button>

  <div id="results"></div>

<script type="module">
  import {
    AutoProcessor,
    AutoModelForVision2Seq,
    load_image,
    TextStreamer,
  } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";

  /**
   * Lazily loads and caches a single processor/model pair.
   *
   * The processor is cached per model id. The model is cached per full load
   * configuration (model id, the three dtypes, device and revision), so a
   * request for the same model id with different settings reloads the model
   * instead of returning the one that was built with the previous settings.
   */
  class SmolVLM {
    static model = null;
    static processor = null;
    static model_id = null;
    // Serialized load configuration of the cached model; null until a model is requested.
    static config_key = null;

    /**
     * Returns the cached processor and model, loading them when needed.
     *
     * @param {string} modelId - Id passed to both `from_pretrained` calls.
     * @param {{embed: *, vision: *, decoder: *}} dtypeSettings - Its `embed`,
     *   `vision` and `decoder` values are forwarded as the dtypes of
     *   `embed_tokens`, `vision_encoder` and `decoder_model_merged`.
     * @param {*} device - Forwarded as the `device` option of the model loader.
     * @param {*} [revision] - Forwarded as the `revision` option of the model loader.
     * @returns {Promise<Array>} `[processor, model]`.
     */
    static async getInstance(modelId, dtypeSettings, device, revision) {
      const configKey = JSON.stringify([
        modelId,
        dtypeSettings?.embed,
        dtypeSettings?.vision,
        dtypeSettings?.decoder,
        device,
        revision,
      ]);
      const modelIdChanged = this.model_id !== modelId;

      // Any change to the load configuration invalidates the cached model.
      // dispose() is awaited before the cache is cleared, so a failing
      // dispose leaves the previous state untouched.
      if (modelIdChanged || this.config_key !== configKey) {
        await this.model?.dispose();
        this.model = null;
        this.config_key = configKey;
      }
      // The processor is loaded from the model id alone, so it only needs to
      // be dropped when the id itself changes.
      if (modelIdChanged) {
        this.processor = null;
        this.model_id = modelId;
      }

      if (!this.processor) {
        this.processor = await AutoProcessor.from_pretrained(modelId);
      }
      if (!this.model) {
        this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
          dtype: {
            embed_tokens: dtypeSettings.embed,
            vision_encoder: dtypeSettings.vision,
            decoder_model_merged: dtypeSettings.decoder,
          },
          device,
          revision,
        });
      }
      return [this.processor, this.model];
    }
  }

  /**
   * Benchmarks each SmolVLM checkpoint in turn with the settings currently
   * selected in the form, rendering a status line, a progress bar and a
   * results table per model into the `#results` container.
   *
   * For every model: load it through `SmolVLM.getInstance`, run a one-token
   * warm-up generation, then perform the requested number of timed
   * generations with `min_new_tokens` and `max_new_tokens` both set to the
   * requested token count. A failure while benchmarking one model is shown in
   * that model's status line and does not stop the remaining models.
   *
   * The option fieldsets and the start button are disabled for the duration
   * of the run and re-enabled when it ends (even on error), so runs cannot
   * overlap and the form does not stay locked afterwards.
   *
   * @returns {Promise<void>}
   */
  async function runBenchmark() {
    const controls = ["model-options", "hardware-options", "benchmark-options", "start-benchmark"]
      .map((id) => document.getElementById(id));
    for (const control of controls) {
      control.disabled = true;
    }

    const resultsDiv = document.getElementById("results");
    resultsDiv.innerHTML = "";

    // Reads a numeric input; empty, non-numeric, zero or negative values use the fallback.
    const readPositiveInt = (id, fallback) => {
      const parsed = Number.parseInt(document.getElementById(id).value, 10);
      return parsed > 0 ? parsed : fallback;
    };

    try {
      // Model id -> revision handed to the model loader.
      const modelIds = {
        "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
        "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
        "HuggingFaceTB/SmolVLM-Instruct": "main"
      };

      const dtypeSettings = {
        decoder: document.getElementById("decoder-dtype").value || "q4",
        embed: document.getElementById("embed-dtype").value || "q4",
        vision: document.getElementById("vision-dtype").value || "q4",
      };
      const device = document.getElementById("device").value;
      const imageUrl = document.getElementById("image-url").value;
      const maxTokens = readPositiveInt("max-tokens", 32);
      const numRuns = readPositiveInt("num-runs", 3);
      const doImageSplitting = document.getElementById("do-split").checked;

      // The image is shared by all models; without it nothing can be
      // benchmarked, so report the failure in the page and stop.
      let image;
      try {
        image = await load_image(imageUrl);
      } catch (e) {
        console.error(e);
        const failure = document.createElement("p");
        failure.innerText = `❌ Error: could not load image: ${String(e)}`;
        resultsDiv.appendChild(failure);
        return;
      }

      for (const [modelId, revision] of Object.entries(modelIds)) {
        const modelShortName = modelId.split("/").pop();

        const modelSection = document.createElement("div");
        modelSection.className = "model-results";
        const heading = document.createElement("h2");
        heading.textContent = `Benchmarking ${modelShortName}`;
        const status = document.createElement("p");
        status.id = `status-${modelShortName}`;
        status.innerText = "Loading...";
        const bar = document.createElement("pre");
        bar.id = `bar-${modelShortName}`;
        bar.innerText = "▯▯▯▯▯";
        modelSection.append(heading, status, bar);
        resultsDiv.appendChild(modelSection);

        try {
          status.innerText = "Loading processor and model...";
          const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);

          status.innerText = "Warming up...";
          const messages = [{
            role: "user",
            content: [
              { type: "image" },
              { type: "text", text: "Can you describe this image?" },
            ],
          }];
          const warmupText = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const warmupInputs = await processor(warmupText, [image], { do_image_splitting: doImageSplitting });
          await model.generate({ ...warmupInputs, max_new_tokens: 1 });

          let totalTime = 0;
          let totalTps = 0;
          const runsResults = [];

          for (let i = 0; i < numRuns; ++i) {
            status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
            bar.innerText = createProgressBar(i + 1, numRuns);

            // The timer starts before preprocessing, so the reported
            // execution time covers templating, image processing and generation.
            const start = performance.now();
            const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
            const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });

            let tokensAfterFirst = 0;
            let firstTokenTime;
            let tps = 0;
            // Rate = tokens received after the first one / time elapsed since
            // the first one. The rate is only updated once some time has
            // passed, so a single-token run reports 0 rather than NaN.
            const token_callback_function = () => {
              const now = performance.now();
              firstTokenTime ??= now;
              const decodeMs = now - firstTokenTime;
              if (decodeMs > 0) {
                tps = (tokensAfterFirst / decodeMs) * 1000;
              }
              tokensAfterFirst += 1;
            };
            const streamer = new TextStreamer(processor.tokenizer, {
              skip_prompt: true,
              skip_special_tokens: true,
              token_callback_function,
            });
            await model.generate({
              ...inputs,
              max_new_tokens: maxTokens,
              min_new_tokens: maxTokens,
              streamer,
            });
            const elapsed = performance.now() - start;

            totalTime += elapsed;
            totalTps += tps;
            runsResults.push({
              run: i + 1,
              time: elapsed.toFixed(2),
              tps: tps.toFixed(2),
            });
          }

          const avgTime = (totalTime / numRuns).toFixed(2);
          const avgTps = (totalTps / numRuns).toFixed(2);
          status.innerText = "✅ Done!";
          bar.innerText = createProgressBar(numRuns, numRuns);

          const rowsHtml = runsResults
            .map((r) => `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`)
            .join("");
          // insertAdjacentHTML appends the table without re-parsing the
          // section, so the existing status and bar elements stay attached.
          modelSection.insertAdjacentHTML(
            "beforeend",
            "<table>" +
              "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>" +
              rowsHtml +
              `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>` +
              "</table>",
          );
        } catch (e) {
          // Keep the full error (with stack) in the console; show a short
          // message in the page and continue with the next model.
          console.error(e);
          status.innerText = `❌ Error: ${String(e)}`;
        }
      }
    } finally {
      for (const control of controls) {
        control.disabled = false;
      }
    }
  }

  /**
   * Renders a text progress bar: one "▮" per completed step followed by one
   * "▯" per remaining step, `total` characters in all.
   *
   * Inputs are truncated to integers and clamped so that `0 <= current <= total`;
   * NaN is treated as 0. This keeps `String.prototype.repeat` from throwing a
   * RangeError on a negative count when `current` exceeds `total` or either
   * value is negative.
   *
   * @param {number} current - Number of completed steps.
   * @param {number} total - Total number of steps.
   * @returns {string} The rendered bar (empty when `total` is 0 or less).
   */
  function createProgressBar(current, total) {
    const totalCells = Math.max(0, Math.trunc(total) || 0);
    const filledCells = Math.min(totalCells, Math.max(0, Math.trunc(current) || 0));
    return "▮".repeat(filledCells) + "▯".repeat(totalCells - filledCells);
  }

  // Start a benchmark run whenever the "Start Benchmark" button is clicked.
  document.getElementById("start-benchmark").addEventListener("click", runBenchmark);
</script>

</body>
</html>