// Real-time zero-shot image classification of a webcam stream with
// MobileCLIP, running fully in-browser via Transformers.js.
import {
  AutoTokenizer,
  CLIPTextModelWithProjection,
  AutoProcessor,
  CLIPVisionModelWithProjection,
  RawImage,
  dot,
  softmax,
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
const status = document.getElementById("status");
const container = document.getElementById("container");
const video = document.getElementById("video");
const labelsInput = document.getElementById("labels");
const templateInput = document.getElementById("template");
const overlay = document.getElementById("overlay");

status.textContent = "Loading model (88MB)...";
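
// Load the tokenizer + text encoder and the image processor + vision
// encoder. The text encoder runs quantized (q8) on the WASM backend; the
// vision encoder runs in fp32 on WebNN.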
const model_id = "Xenova/mobileclip_s0";
let tokenizer, text_model, processor, vision_model;
try {
  tokenizer = await AutoTokenizer.from_pretrained(model_id);
  text_model = await CLIPTextModelWithProjection.from_pretrained(model_id, {
    device: "wasm",
    dtype: "q8",
  });

  processor = await AutoProcessor.from_pretrained(model_id);
  vision_model = await CLIPVisionModelWithProjection.from_pretrained(model_id, {
    device: "webnn",
    dtype: "fp32",
  });
} catch (err) {
  console.error(err);
  status.textContent = err.message;
  alert(err.message);
  throw err;
}
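
// Enable the label and template inputs now that the models are ready.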
labelsInput.disabled = false;
templateInput.disabled = false;

status.textContent = "Ready";
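
// CLIP's learned logit scale: exp(4.6052) ≈ 100, applied to the cosine
// similarities before the softmax.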
const exp_logit_scale = Math.exp(4.6052);
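
// Offscreen canvas used to downscale each webcam frame to the model's
// 224x224 input resolution.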
const IMAGE_SIZE = 224;
const canvas = document.createElement("canvas");
canvas.width = canvas.height = IMAGE_SIZE;
const context = canvas.getContext("2d", { willReadFrequently: true });

let isProcessing = false; // guards against overlapping inference passes
let previousTime; // timestamp of the previous processed frame (for FPS)
let textEmbeddings; // cached, normalized embeddings for the current labels
let prevTextInputs; // labels string used to build the cache
let prevTemplate; // prompt template used to build the cache
let labels; // parsed list of class labels
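
// Runs once per animation frame: refreshes the cached text embeddings if
// the labels or template changed, embeds the current frame, and scores it
// against every label.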
function onFrameUpdate() {
  if (!isProcessing) {
    isProcessing = true;
    (async function () {
      // Recompute the text embeddings only when the labels or the prompt
      // template have changed since the last frame.
      if (
        prevTextInputs !== labelsInput.value ||
        prevTemplate !== templateInput.value
      ) {
        textEmbeddings = null;
        prevTextInputs = labelsInput.value;
        prevTemplate = templateInput.value;
        labels = prevTextInputs.split(/\s*,\s*/).filter((x) => x);

        if (labels.length > 0) {
          // Fill the prompt template (e.g. "A photo of a {}") with each label.
          const texts = labels.map((x) =>
            templateInput.value.replaceAll("{}", x),
          );

          const text_inputs = tokenizer(texts, {
            padding: "max_length",
            truncation: true,
          });

          // Encode the prompts and L2-normalize the embeddings.
          const { text_embeds } = await text_model(text_inputs);
          textEmbeddings = text_embeds.normalize().tolist();
        } else {
          overlay.innerHTML = "";
        }
      }

      if (textEmbeddings) {
        // Draw the current video frame into the offscreen canvas,
        // downscaling it to the model's input size.
        context.drawImage(video, 0, 0, IMAGE_SIZE, IMAGE_SIZE);
        const pixelData = context.getImageData(
          0,
          0,
          IMAGE_SIZE,
          IMAGE_SIZE,
        ).data;
        const image = new RawImage(pixelData, IMAGE_SIZE, IMAGE_SIZE, 4);

        const image_inputs = await processor(image);

        // Encode the frame and L2-normalize the embedding.
        const { image_embeds } = await vision_model(image_inputs);
        const imageEmbedding = image_embeds.normalize().tolist()[0];

        // Both embeddings are unit-length, so the dot product is the
        // cosine similarity; scale it by the model's logit scale.
        const similarities = textEmbeddings.map(
          (x) => dot(x, imageEmbedding) * exp_logit_scale,
        );

        // Convert the scaled similarities to probabilities and sort the
        // labels by descending score.
        const sortedIndices = softmax(similarities)
          .map((x, i) => [x, i])
          .sort((a, b) => b[0] - a[0]);

        // Render the ranked label/score pairs.
        overlay.innerHTML = "";
        for (const [score, index] of sortedIndices) {
          overlay.appendChild(
            document.createTextNode(`${labels[index]}: ${score.toFixed(2)}`),
          );
          overlay.appendChild(document.createElement("br"));
        }
      }

      // Report the throughput of the processing loop.
      if (previousTime !== undefined) {
        const fps = 1000 / (performance.now() - previousTime);
        status.textContent = `FPS: ${fps.toFixed(2)}`;
      }
      previousTime = performance.now();
      isProcessing = false;
    })();
  }

  window.requestAnimationFrame(onFrameUpdate);
}
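
// Request webcam access, then start the processing loop once the stream's
// dimensions are known.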
navigator.mediaDevices
  .getUserMedia({ video: true })
  .then((stream) => {
    video.srcObject = stream;
    video.play();

    const videoTrack = stream.getVideoTracks()[0];
    const { width, height } = videoTrack.getSettings();

    video.width = width;
    video.height = height;

    // Fit the container within 720x405 while preserving the camera's
    // aspect ratio.
    const ar = width / height;
    const [cw, ch] = ar > 720 / 405 ? [720, 720 / ar] : [405 * ar, 405];
    container.style.width = `${cw}px`;
    container.style.height = `${ch}px`;

    window.requestAnimationFrame(onFrameUpdate);
  })
  .catch((error) => {
    alert(error);
  });