Spaces:

pdufour
/

Qwen2VL_TransformersJS_Demo

Running

File size: 3,466 Bytes

54648ea

import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";


// Image loaded when the "example" button is clicked.
const EXAMPLE_URL = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg";

// DOM elements the demo reads from and writes to.
const exampleButton = document.getElementById('example');
// First text input found in the document; holds the user's prompt.
const promptInput = document.querySelector('input[type="text"]');
const status = document.getElementById('status');
const thumb = document.getElementById('thumb');
const uploadInput = document.getElementById('upload');
const form = document.getElementById('form');
const output = document.getElementById('llm-output');

// Currently selected image: either EXAMPLE_URL or a data URL produced from an uploaded file.
let currentImage = '';
// Most recently captured prompt text.
let currentQuery = '';
// Model identifier passed to both `from_pretrained` calls in initializeSessions().
const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
// Assigned by initializeSessions(); undefined until loading completes.
let processor;
let model;

/**
 * Loads the processor and model for `model_id`, then unlocks the UI.
 *
 * While loading, the status element shows a progress message and the
 * container element (if present) carries the `disabled` class.
 *
 * @returns {Promise<void>} Resolves once `processor` and `model` are assigned.
 * @throws Rethrows any error raised while loading, after reporting it in the
 *   status element.
 */
async function initializeSessions() {
  // The original code used a bare `container` identifier that is not declared
  // anywhere in this file, so it could only resolve through the browser's
  // implicit named-element globals. Look it up explicitly instead.
  // NOTE(review): assumes the element's id is "container" — confirm against the HTML.
  const container = document.getElementById('container');

  status.textContent = 'Loading model...';
  // The class toggle is cosmetic, so a missing element must not abort loading.
  container?.classList.add('disabled');

  try {
    processor = await AutoProcessor.from_pretrained(model_id);
    model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, { dtype: 'q4f16', device: 'webgpu' });
  } catch (err) {
    // Without this the status would stay on "Loading model..." forever.
    status.textContent = 'Error loading model';
    console.error(err);
    throw err;
  }

  status.textContent = 'Ready';
  status.classList.add('ready');

  uploadInput.disabled = false;
  promptInput.disabled = false;
  container?.classList.remove('disabled');
}

/**
 * Runs one image + prompt query and reflects progress in the UI.
 *
 * The decoded model output is written to the output element. Errors are
 * logged and reported in the status element rather than rethrown.
 *
 * @param {string} imageUrl - URL (or data URL) of the image to analyze.
 * @param {string} query - Prompt text to send alongside the image.
 * @returns {Promise<void>}
 */
async function handleQuery(imageUrl, query) {
  try {
    status.textContent = 'Analyzing...';

    await imageTextToText(imageUrl, query, (out) => {
      console.log({ out });
      output.textContent = out;
    });

    // Previously the status stayed on "Analyzing..." after a successful run.
    status.textContent = 'Ready';
  } catch (err) {
    status.textContent = 'Error processing request';
    console.error(err);
  } finally {
    // The submit handler disables these inputs before calling this function;
    // re-enable them so the user can run another query, even after a failure.
    promptInput.disabled = false;
    uploadInput.disabled = false;
  }
}


/**
 * Asks the loaded vision-language model a question about an image.
 *
 * The image is read from `imagePath`, resized to 448x448, and sent with
 * `query` as a single user turn. Up to 128 new tokens are generated.
 *
 * @param {string} imagePath - Location of the image, passed to `RawImage.read`.
 * @param {string} query - Text of the user's question.
 * @param {Function} cb - Called once with the decoded output before returning.
 * @returns {Promise<*>} Whatever `processor.batch_decode` returns
 *   (presumably an array of decoded strings — TODO confirm).
 */
export async function imageTextToText(
  imagePath,
  query,
  cb,
) {
  const rawImage = await RawImage.read(imagePath);
  const image = await rawImage.resize(448, 448);

  const userTurn = {
    role: "user",
    content: [
      { type: "image" },
      { type: "text", text: query },
    ],
    images: [image],
  };
  const prompt = processor.apply_chat_template([userTurn], { add_generation_prompt: true });
  const inputs = await processor(prompt, image);

  const generated = await model.generate({ ...inputs, max_new_tokens: 128 });

  // Slice the second axis from the input length onward, presumably so the
  // prompt tokens are not decoded along with the newly generated ones.
  const promptLength = inputs.input_ids.dims.at(-1);
  const newTokens = generated.slice(null, [promptLength, null]);
  const decoded = processor.batch_decode(newTokens, { skip_special_tokens: true });

  cb(decoded);
  return decoded;
}

/**
 * Shows the image at `url` as the background of the thumbnail element,
 * sized to fit a 320px box while keeping the image's aspect ratio.
 *
 * @param {string} url - URL (or data URL) of the image to preview.
 * @returns {Promise<void>}
 */
async function updatePreview(url) {
  const { width, height } = await RawImage.fromURL(url);
  const aspect = width / height;

  // Landscape images are limited by width, everything else by height.
  let boxWidth;
  let boxHeight;
  if (aspect > 1) {
    boxWidth = 320;
    boxHeight = 320 / aspect;
  } else {
    boxWidth = 320 * aspect;
    boxHeight = 320;
  }

  const { style } = thumb;
  style.width = `${boxWidth}px`;
  style.height = `${boxHeight}px`;
  style.backgroundImage = `url(${url})`;

  // Clear whatever content the thumbnail element held before.
  thumb.innerHTML = '';
}

// Load the processor and model first; this top-level await pauses the rest of
// the module, so the UI handlers below are only registered once loading finishes.
await initializeSessions();

// UI event handlers

// Example button: select the bundled example image and preview it.
exampleButton.addEventListener('click', (event) => {
  event.preventDefault();

  currentImage = EXAMPLE_URL;
  updatePreview(EXAMPLE_URL);
});

// File upload: read the chosen file as a data URL, then select and preview it.
uploadInput.addEventListener('change', (event) => {
  const [file] = event.target.files;
  if (!file) {
    return;
  }

  const reader = new FileReader();
  reader.onload = (loadEvent) => {
    const dataUrl = loadEvent.target.result;
    currentImage = dataUrl;
    updatePreview(dataUrl);
  };
  reader.readAsDataURL(file);
});

// Track the prompt text as the user edits it.
// The deprecated 'keypress' event fires before the field's value changes and
// does not fire for paste, backspace or delete, so the captured text could be
// stale. 'input' fires after every change to the value.
promptInput.addEventListener('input', (e) => {
  currentQuery = e.target.value;
});

// Form submit: run the query if both an image and a prompt are available.
form.addEventListener('submit', (event) => {
  event.preventDefault();

  const hasImageAndPrompt = Boolean(currentImage) && Boolean(currentQuery);
  if (!hasImageAndPrompt) {
    status.textContent = 'Please select an image and type a prompt';
    return;
  }

  // Lock the inputs while the request is being processed.
  promptInput.disabled = true;
  uploadInput.disabled = true;
  handleQuery(currentImage, currentQuery);
});