<!-- Hugging Face file-viewer metadata (not part of the page):
     Xenova (HF Staff) — "Update index.html" — commit 4439ab3 (verified) -->
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>SmolVLM Benchmark Demo</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
fieldset { margin-bottom: 20px; padding: 10px; }
legend { font-weight: bold; }
label { display: block; margin-top: 5px; }
input, select { margin-bottom: 5px; width: 100%; max-width: 400px; }
table { border-collapse: collapse; margin-top: 20px; width: 100%; max-width: 600px; }
th, td { border: 1px solid #ccc; padding: 8px; text-align: left; }
button { padding: 10px 20px; }
.model-results { margin-bottom: 40px; }
</style>
</head>
<body>
<h1>SmolVLM Benchmark Demo</h1>
<!-- Model Options (ignored in the benchmark loop) -->
<fieldset id="model-options">
<legend>Model Options (Note: Benchmarking all three SmolVLM models by default)</legend>
<label for="model-id">Select Model ID:</label>
<select id="model-id" disabled>
<option value="hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration">hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration</option>
<option value="HuggingFaceTB/SmolVLM-256M-Instruct" selected>HuggingFaceTB/SmolVLM-256M-Instruct</option>
<option value="HuggingFaceTB/SmolVLM-500M-Instruct">HuggingFaceTB/SmolVLM-500M-Instruct</option>
<option value="HuggingFaceTB/SmolVLM-Instruct">HuggingFaceTB/SmolVLM-Instruct</option>
</select>
<label for="decoder-dtype">Decoder (decoder_model_merged) dtype:</label>
<select id="decoder-dtype">
<option value="fp32">fp32</option>
<option value="fp16">fp16</option>
<option value="q8">q8</option>
<option value="q4" selected>q4</option>
<option value="q4f16">q4f16</option>
</select>
<label for="embed-dtype">Embed Tokens dtype:</label>
<select id="embed-dtype">
<option value="fp32">fp32</option>
<option value="fp16">fp16</option>
<option value="q8">q8</option>
<option value="q4" selected>q4</option>
<option value="q4f16">q4f16</option>
</select>
<label for="vision-dtype">Vision Encoder dtype:</label>
<select id="vision-dtype">
<option value="fp32">fp32</option>
<option value="fp16">fp16</option>
<option value="q8">q8</option>
<option value="q4" selected>q4</option>
<option value="q4f16">q4f16</option>
</select>
</fieldset>
<!-- Hardware Options -->
<fieldset id="hardware-options">
<legend>Hardware Options</legend>
<label for="device">Select Device:</label>
<select id="device">
<option value="wasm">wasm</option>
<option value="webgpu" selected>webgpu</option>
</select>
</fieldset>
<!-- Benchmark Options -->
<fieldset id="benchmark-options">
<legend>Benchmark Options</legend>
<label for="image-url">Image URL:</label>
<input type="text" id="image-url" value="https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg">
<label for="do-split">Do Image Splitting (do_image_splitting)</label>
<input type="checkbox" id="do-split" checked>
<label for="max-tokens">Number of Tokens to Generate:</label>
<input type="number" id="max-tokens" value="32">
<label for="num-runs">Number of Runs:</label>
<input type="number" id="num-runs" value="3">
</fieldset>
<button id="start-benchmark">Start Benchmark</button>
<div id="results"></div>
<script type="module">
import {
AutoProcessor,
AutoModelForVision2Seq,
load_image,
TextStreamer,
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
/**
 * Lazily-loaded singleton holding the processor + model for the current
 * benchmark configuration. Loading is expensive (downloads ONNX weights),
 * so instances are cached and only rebuilt when the configuration changes.
 */
class SmolVLM {
  static model = null;
  static processor = null;
  static model_id = null;
  // Full configuration key for the cached instance. The original keyed the
  // cache on model id alone, so a second call with a different dtype, device,
  // or revision for the same model silently reused a stale model.
  static #config_key = null;

  /**
   * Get (or build) the processor/model pair for the given configuration.
   * @param {string} modelId - Hugging Face model id.
   * @param {{decoder: string, embed: string, vision: string}} dtypeSettings - per-component dtypes.
   * @param {string} device - execution backend ("wasm" or "webgpu").
   * @param {string} revision - model repo revision (branch / PR ref).
   * @returns {Promise<[processor, model]>}
   */
  static async getInstance(modelId, dtypeSettings, device, revision) {
    const key = JSON.stringify([modelId, dtypeSettings, device, revision]);
    if (this.#config_key !== key) {
      // Configuration changed: release the old model before replacing it.
      await this.model?.dispose();
      this.model = null;
      this.processor = null;
      this.model_id = modelId;
      this.#config_key = key;
    }
    if (!this.processor) {
      this.processor = await AutoProcessor.from_pretrained(modelId);
    }
    if (!this.model) {
      this.model = await AutoModelForVision2Seq.from_pretrained(modelId, {
        dtype: {
          embed_tokens: dtypeSettings.embed,
          vision_encoder: dtypeSettings.vision,
          decoder_model_merged: dtypeSettings.decoder,
        },
        device,
        revision,
      });
    }
    return [this.processor, this.model];
  }
}
/**
 * Run the full benchmark: for each SmolVLM model, load it, do one warm-up
 * generation, then time `numRuns` fixed-length generations and render a
 * per-run results table. Reads all settings from the form controls.
 */
async function runBenchmark() {
  // Lock the configuration UI while the benchmark runs; restored in `finally`
  // (the original left the fieldsets permanently disabled after a run, and
  // never guarded against double-clicking the start button).
  const startButton = document.getElementById("start-benchmark");
  startButton.disabled = true;
  document.getElementById("model-options").disabled = true;
  document.getElementById("hardware-options").disabled = true;
  const resultsDiv = document.getElementById("results");
  resultsDiv.innerHTML = "";
  // Model -> revision map. The 256M/500M checkpoints load their ONNX weights
  // from open PR branches; the base model uses `main`.
  const modelIds = {
    "HuggingFaceTB/SmolVLM-256M-Instruct": "refs/pr/11",
    "HuggingFaceTB/SmolVLM-500M-Instruct": "refs/pr/9",
    "HuggingFaceTB/SmolVLM-Instruct": "main"
  };
  // Read benchmark configuration from the form, with fallbacks for blank fields.
  const decoder_dtype = document.getElementById("decoder-dtype").value || "q4";
  const embed_dtype = document.getElementById("embed-dtype").value || "q4";
  const vision_dtype = document.getElementById("vision-dtype").value || "q4";
  const device = document.getElementById("device").value;
  const imageUrl = document.getElementById("image-url").value;
  const maxTokens = Number.parseInt(document.getElementById("max-tokens").value, 10) || 32;
  const numRuns = Number.parseInt(document.getElementById("num-runs").value, 10) || 3;
  const doImageSplitting = document.getElementById("do-split").checked;
  const dtypeSettings = { decoder: decoder_dtype, embed: embed_dtype, vision: vision_dtype };
  try {
    const image = await load_image(imageUrl);
    for (const [modelId, revision] of Object.entries(modelIds)) {
      const modelShortName = modelId.split("/").pop();
      const modelSection = document.createElement("div");
      modelSection.className = "model-results";
      modelSection.innerHTML = `<h2>Benchmarking ${modelShortName}</h2><p id="status-${modelShortName}">Loading...</p><pre id="bar-${modelShortName}">▯▯▯▯▯</pre>`;
      resultsDiv.appendChild(modelSection);
      const status = document.getElementById(`status-${modelShortName}`);
      const bar = document.getElementById(`bar-${modelShortName}`);
      try {
        status.innerText = "Loading processor and model...";
        const [processor, model] = await SmolVLM.getInstance(modelId, dtypeSettings, device, revision);
        status.innerText = "Warming up...";
        const messages = [{
          role: "user",
          content: [
            { type: "image" },
            { type: "text", text: "Can you describe this image?" },
          ],
        }];
        // Warm-up generation (1 token) so first-run overheads (e.g. shader
        // compilation on WebGPU) are excluded from the timed runs below.
        const warmupText = processor.apply_chat_template(messages, { add_generation_prompt: true });
        const warmupInputs = await processor(warmupText, [image], { do_image_splitting: doImageSplitting });
        await model.generate({ ...warmupInputs, max_new_tokens: 1 });
        let totalTime = 0;
        let totalTps = 0;
        const runsResults = [];
        for (let i = 0; i < numRuns; ++i) {
          status.innerText = `Running benchmark... (${i + 1}/${numRuns})`;
          bar.innerText = createProgressBar(i + 1, numRuns);
          const start = performance.now();
          // Preprocessing is deliberately inside the timed region.
          const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
          const inputs = await processor(text, [image], { do_image_splitting: doImageSplitting });
          let numTokens = 0;
          let startTime;
          let tps = 0;
          // Tokens/sec is measured from the first generated token, so the
          // prefill stage is excluded from the tps figure (but not from
          // the total elapsed time).
          const token_callback_function = () => {
            startTime = startTime || performance.now();
            tps = (numTokens++ / (performance.now() - startTime)) * 1000;
          };
          const streamer = new TextStreamer(processor.tokenizer, {
            skip_prompt: true,
            skip_special_tokens: true,
            token_callback_function,
          });
          // min_new_tokens === max_new_tokens forces a fixed-length generation
          // so every run performs the same amount of work.
          await model.generate({
            ...inputs,
            max_new_tokens: maxTokens,
            min_new_tokens: maxTokens,
            streamer,
          });
          const elapsed = performance.now() - start;
          totalTime += elapsed;
          totalTps += tps;
          runsResults.push({
            run: i + 1,
            time: elapsed.toFixed(2),
            tps: tps.toFixed(2)
          });
        }
        const avgTime = (totalTime / numRuns).toFixed(2);
        const avgTps = (totalTps / numRuns).toFixed(2);
        status.innerText = "✅ Done!";
        bar.innerText = createProgressBar(numRuns, numRuns);
        let tableHtml = "<table>";
        tableHtml += "<tr><th>Run</th><th>Execution Time (ms)</th><th>Tokens per Second</th></tr>";
        for (const r of runsResults) {
          tableHtml += `<tr><td>${r.run}</td><td>${r.time}</td><td>${r.tps}</td></tr>`;
        }
        tableHtml += `<tr><td><strong>Average</strong></td><td><strong>${avgTime}</strong></td><td><strong>${avgTps}</strong></td></tr>`;
        tableHtml += "</table>";
        // insertAdjacentHTML appends without re-parsing the section's existing
        // DOM (innerHTML += would destroy and recreate the status/bar nodes).
        modelSection.insertAdjacentHTML("beforeend", tableHtml);
      } catch (e) {
        console.error(e); // keep the full stack in the console for debugging
        status.innerText = "❌ Error: " + e.toString();
      }
    }
  } finally {
    // Restore the UI so another benchmark can be started.
    document.getElementById("model-options").disabled = false;
    document.getElementById("hardware-options").disabled = false;
    startButton.disabled = false;
  }
}
/**
 * Render a fixed-width textual progress bar.
 * e.g. createProgressBar(2, 5) -> "▮▮▯▯▯"
 * @param {number} current - number of completed steps (0..total).
 * @param {number} total - total number of steps.
 * @returns {string} `total` cells: `current` filled, the rest empty.
 */
function createProgressBar(current, total) {
  return "▮".repeat(current).padEnd(total, "▯");
}
// Entry point: the benchmark only runs when the user clicks the start button.
document.getElementById("start-benchmark").addEventListener("click", runBenchmark);
</script>
</body>
</html>