smolvlm-web-benchmarking-all

Running

File size: 9,479 Bytes

0dfccf0
a4e4aa9
0dfccf0
 
 
 
 
 
 
 
 
 
 
 
3859cfd
0dfccf0
 
 
 
 
3859cfd
0dfccf0
3859cfd
0dfccf0
3859cfd
0dfccf0
 
 
 
 
 
 
119cf5f
 
 
453253c
 
119cf5f
 
 
 
 
 
 
453253c
 
119cf5f
 
 
 
 
 
 
453253c
 
119cf5f
 
0dfccf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453253c
0dfccf0
 
6ba7708
0dfccf0
 
 
 
 
 
68bfa8d
 
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
4439ab3
68bfa8d
 
 
0dfccf0
68bfa8d
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
 
 
0dfccf0
68bfa8d
 
 
 
 
 
 
 
4439ab3
 
 
 
 
0dfccf0
453253c
 
 
68bfa8d
 
453253c
d3024f5
68bfa8d
 
 
 
 
4439ab3
68bfa8d
 
 
 
 
 
 
 
 
 
 
4439ab3
68bfa8d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4439ab3
0dfccf0
 
c06b59f
 
 
 
 
 
 
 
68bfa8d
 
 
c06b59f
68bfa8d
0dfccf0
 
68bfa8d
 
 
 
c06b59f
 
68bfa8d
 
 
 
 
 
 
0dfccf0
3859cfd
68bfa8d
3859cfd
 
68bfa8d
 
 
3859cfd
 
 
 
 
 
 
68bfa8d
 
 
 
0dfccf0
 
68bfa8d
 
 
 
 
 
 
 
 
 
 
0dfccf0
a4e4aa9
3859cfd

<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <title>SmolVLM Benchmark Demo</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    fieldset { margin-bottom: 20px; padding: 10px; }
    legend { font-weight: bold; }
    label { display: block; margin-top: 5px; }
    input, select { margin-bottom: 5px; width: 100%; max-width: 400px; }
    table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; }
    th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
    button { padding: 10px 20px; }
    .model-results { margin-bottom: 40px; }
  </style>
</head>
<body>
  <h1>SmolVLM Benchmark Demo</h1>
  
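  <!-- Model Options: the model selector is disabled and ignored (all three SmolVLM models are always benchmarked); the dtype selectors below ARE applied to every model. -->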
  <fieldset id="model-options">
    <legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend>
    <label for="model-id">Select Model ID:</label>
    <select id="model-id" disabled>
      <option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option>
      <option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option>
      <option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option>
    </select>

    <label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label>
  <select id="decoder-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  
  <label for="embed-dtype">Embed Tokens dtype:</label>
  <select id="embed-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  
  <label for="vision-dtype">Vision Encoder dtype:</label>
  <select id="vision-dtype">
    <option value="fp32">fp32</option>
    <option value="fp16">fp16</option>
    <option value="q8">q8</option>
    <option value="q4" selected>q4</option>
    <option value="q4f16">q4f16</option>
  </select>
  </fieldset>

  <!-- Hardware Options -->
  <fieldset id="hardware-options">
    <legend>Hardware Options</legend>
    <label for="device">Select Device:</label>
    <select id="device">
      <option value="wasm">wasm</option>
      <option value="webgpu" selected>webgpu</option>
    </select>
  </fieldset>

  <!-- Benchmark Options -->
  <fieldset id="benchmark-options">
    <legend>Benchmark Options</legend>
    <label for="image-url">Image URL:</label>
    <input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg">

    <label for="do-split">Do Image Splitting (do_image_splitting)</label>
    <input type="checkbox" id="do-split" checked>

    <label for="max-tokens">Number of Tokens to Generate:</label>
    <input type="number" id="max-tokens" value="32">

    <label for="num-runs">Number of Runs:</label>
    <input type="number" id="num-runs" value="3">
  </fieldset>

  <button id="start-benchmark">Start Benchmark</button>

  <div id="results"></div>

<script type="module">
  import {
    AutoProcessor,
    AutoModelForVision2Seq,
    load_image,
    TextStreamer,
  } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";

  /**
   * Lazily-loaded, single-slot cache for one processor/model pair.
   *
   * Only one model is kept alive at a time. The processor is cached per model
   * id; the model is cached per full load configuration (model id, dtypes,
   * device and revision), so asking for the same model id with different
   * options reloads the model instead of returning a stale instance.
   */
  class SmolVLM {
    static model = null;
    static processor = null;
    static model_id = null;
    // Fingerprint of the options the cached model was loaded with.
    static model_key = null;

    /**
     * Returns the cached processor and model, (re)loading whatever is missing
     * or was loaded with different options.
     *
     * @param {string} modelId - Model repository id passed to `from_pretrained`.
     * @param {{embed: string, vision: string, decoder: string}} dtypeSettings -
     *   dtype for the embed_tokens, vision_encoder and decoder_model_merged parts.
     * @param {string} device - Device passed through to the model loader.
     * @param {string} [revision] - Revision passed through to the model loader.
     * @returns {Promise<Array>} `[processor, model]`.
     */
    static async getInstance(modelId, dtypeSettings, device, revision) {
      const { embed, vision, decoder } = dtypeSettings;
      const modelKey = JSON.stringify([modelId, embed, vision, decoder, device, revision]);

      // Any change in the load options invalidates the cached model.
      if (this.model_key !== modelKey) {
        await this.model?.dispose();
        this.model = null;
        this.model_key = modelKey;
      }
      // The processor is loaded from the model id alone, so it only needs to
      // be dropped when the model id itself changes.
      if (this.model_id !== modelId) {
        this.processor = null;
        this.model_id = modelId;
      }

      if (!this.processor) {
        this.processor = await AutoProcessor.from_pretrained(modelId);
      }
      if (!this.model) {
        this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
          dtype: {
            embed_tokens: embed,
            vision_encoder: vision,
            decoder_model_merged: decoder,
          },
          device,
          revision,
        });
      }
      return [this.processor, this.model];
    }
  }

  /**
   * Benchmarks every SmolVLM checkpoint in the hard-coded `modelIds` map and
   * renders one result table per model into `#results`.
   *
   * Settings (dtypes, device, image URL, token count, run count, image
   * splitting) are read from the form once, when the run starts. Each model is
   * loaded through `SmolVLM.getInstance`, warmed up with a single-token
   * generation, and then timed for `numRuns` generations of `maxTokens` new
   * tokens each. A failure in one model is shown in that model's status line
   * and does not stop the remaining models.
   *
   * @returns {Promise<void>} Resolves once every model has been processed.
   */
  async function runBenchmark() {
    const modelOptions = document.getElementById("model-options");
    const hardwareOptions = document.getElementById("hardware-options");
    const startButton = document.getElementById("start-benchmark");
    const resultsDiv = document.getElementById("results");

    // Lock the controls while a run is in flight. Disabling the start button
    // also prevents overlapping runs, which would share (and dispose) the same
    // cached model inside SmolVLM.
    modelOptions.disabled = true;
    hardwareOptions.disabled = true;
    startButton.disabled = true;
    resultsDiv.innerHTML = "";

    // Reads a positive integer from an <input>; empty, non-numeric, zero or
    // negative values fall back to the default.
    const readPositiveInt = (id, fallback) => {
      const parsed = Number.parseInt(document.getElementById(id).value, 10);
      return parsed > 0 ? parsed : fallback;
    };

    try {
      // Model id -> repository revision to load it from.
      const modelIds = {
        "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
        "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
        "HuggingFaceTB/SmolVLM-Instruct": "main"
      };

      const dtypeSettings = {
        decoder: document.getElementById("decoder-dtype").value || "q4",
        embed: document.getElementById("embed-dtype").value || "q4",
        vision: document.getElementById("vision-dtype").value || "q4",
      };
      const device = document.getElementById("device").value;
      const imageUrl = document.getElementById("image-url").value;
      const maxTokens = readPositiveInt("max-tokens", 32);
      const numRuns = readPositiveInt("num-runs", 3);
      const doImageSplitting = document.getElementById("do-split").checked;

      // A bad image URL affects every model, so report it once and stop.
      let image;
      try {
        image = await load_image(imageUrl);
      } catch (e) {
        console.error(e);
        resultsDiv.textContent = `❌ Error: could not load image (${e})`;
        return;
      }

      // The same single-turn prompt is used for warm-up and for every run.
      const messages = [{
        role: "user",
        content: [
          { type: "image" },
          { type: "text", text: "Can you describe this image?" },
        ],
      }];

      for (const [modelId, revision] of Object.entries(modelIds)) {
        const modelShortName = modelId.split("/").pop();
        const modelSection = document.createElement("div");
        modelSection.className = "model-results";
        modelSection.innerHTML = `<h2>Benchmarking ${modelShortName}</h2><p id="status-${modelShortName}">Loading...</p><pre id="bar-${modelShortName}">${createProgressBar(0, numRuns)}</pre>`;
        resultsDiv.appendChild(modelSection);

        const status = document.getElementById(`status-${modelShortName}`);
        const bar = document.getElementById(`bar-${modelShortName}`);

        try {
          status.innerText = "Loading processor and model...";
          const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);

          // One untimed single-token generation before measuring.
          status.innerText = "Warming up...";
          const warmupText = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const warmupInputs = await processor(warmupText, [image], { do_image_splitting: doImageSplitting });
          await model.generate({ ...warmupInputs, max_new_tokens: 1 });

          const runsResults = [];

          for (let i = 0; i < numRuns; ++i) {
            status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
            bar.innerText = createProgressBar(i + 1, numRuns);

            // The timed region covers prompt templating and image
            // preprocessing as well as generation.
            const start = performance.now();
            const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
            const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });

            // Throughput is measured from the first streamer callback onwards:
            // callbacks after the first, divided by the time since the first.
            let firstTokenAt;
            let tokensSinceFirst = 0;
            let tps = 0;
            const token_callback_function = () => {
              const now = performance.now();
              if (firstTokenAt === undefined) {
                firstTokenAt = now;
                return;
              }
              tokensSinceFirst += 1;
              const sinceFirst = now - firstTokenAt;
              // Skip a zero interval so tps never becomes NaN or Infinity.
              if (sinceFirst > 0) {
                tps = (tokensSinceFirst / sinceFirst) * 1000;
              }
            };
            const streamer = new TextStreamer(processor.tokenizer, {
              skip_prompt: true,
              skip_special_tokens: true,
              token_callback_function,
            });
            // min_new_tokens === max_new_tokens requests the same token count on every run.
            await model.generate({
              ...inputs,
              max_new_tokens: maxTokens,
              min_new_tokens: maxTokens,
              streamer,
            });
            const elapsed = performance.now() - start;

            runsResults.push({ run: i + 1, time: elapsed, tps });
          }

          const totalTime = runsResults.reduce((sum, r) => sum + r.time, 0);
          const totalTps = runsResults.reduce((sum, r) => sum + r.tps, 0);
          const avgTime = (totalTime / numRuns).toFixed(2);
          const avgTps = (totalTps / numRuns).toFixed(2);
          status.innerText = "✅ Done!";
          bar.innerText = createProgressBar(numRuns, numRuns);

          const rowsHtml = runsResults
            .map((r) => `<tr><td>${r.run}</td><td>${r.time.toFixed(2)}</td><td>${r.tps.toFixed(2)}</td></tr>`)
            .join("");
          const tableHtml =
            "<table>" +
            "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>" +
            rowsHtml +
            `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>` +
            "</table>";
          // Append without re-parsing the section, so the existing status and
          // bar elements stay attached.
          modelSection.insertAdjacentHTML("beforeend", tableHtml);
        } catch (e) {
          console.error(e);
          status.innerText = `❌ Error: ${e}`;
        }
      }
    } finally {
      // Always hand the controls back, even after an early return or error.
      modelOptions.disabled = false;
      hardwareOptions.disabled = false;
      startButton.disabled = false;
    }
  }

  /**
   * Renders a text progress bar: one filled cell (▮) per completed step
   * followed by one empty cell (▯) per remaining step.
   *
   * Out-of-range input is clamped instead of throwing: `total` is floored and
   * treated as at least 0, and `current` is floored and limited to
   * `[0, total]`. Non-finite values count as 0.
   *
   * @param {number} current - Number of completed steps.
   * @param {number} total - Total number of steps (the bar's width in cells).
   * @returns {string} The bar, `total` characters long after clamping.
   */
  function createProgressBar(current, total) {
    // String.prototype.repeat throws a RangeError for negative or infinite
    // counts, so normalise both values first.
    const toCount = (value) => {
      const n = Number(value);
      return Number.isFinite(n) ? Math.max(0, Math.floor(n)) : 0;
    };
    const cells = toCount(total);
    const filledCells = Math.min(toCount(current), cells);
    return "▮".repeat(filledCells) + "▯".repeat(cells - filledCells);
  }

  // Wire up the "Start Benchmark" button: every click invokes runBenchmark.
  const benchmarkTrigger = document.getElementById("start-benchmark");
  benchmarkTrigger.addEventListener("click", runBenchmark);
</script>

</body>
</html>