Nadav Eden committed
Commit b7a2f31 · 1 Parent(s): e425a6f

adding vlm support

Files changed (3):
  1. app.py +108 -10
  2. requirements.txt +4 -0
  3. utils.py +42 -0
app.py CHANGED

@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 
 import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from PIL import Image
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor, Qwen2VLForConditionalGeneration
+ from utils import image_to_base64, rescale_bounding_boxes, draw_bounding_boxes, florence_draw_bboxes
+ from qwen_vl_utils import process_vision_info
+ import re
 
 llms = {
     "Qwen2-1.5B": {"model": "Qwen/Qwen2-1.5B-Instruct", "prefix": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
@@ -12,25 +16,31 @@ llms = {
     "DeepSeek-Coder": {"model": "DeepSeek/DeepSeek-Coder", "prefix": "You are a helpful assistant."},
 }
 
- vlms = dict()
+ vlms = {
+     "Florence-2-base": {"model": "microsoft/Florence-2-base", "prefix": "help me"},
+     "Florence-2-large": {"model": "microsoft/Florence-2-large", "prefix": "help me"},
+     "Qwen2-vl-2B": {"model": "Qwen/Qwen2-VL-2B-Instruct", "prefix": "You are a helpful assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] with the values being scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."},
+     "Qwen2-vl-7B": {"model": "Qwen/Qwen2-VL-7B-Instruct", "prefix": "You are a helpful assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] with the values being scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."},
+     "Qwen2.5-vl-3B": {"model": "Qwen/Qwen2.5-VL-3B-Instruct", "prefix": "You are a helpful assistant to detect objects in images. When asked to detect elements based on a description you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] with the values being scaled to 1000 by 1000 pixels. When there is more than one result, answer with a list of bounding boxes in the form of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."}
+ }
+ 
+ tasks = ["<OD>", "<OCR>", "<CAPTION>", "<OCR_WITH_REGION>"]
 
- def run_example(text_input, model_id="Qwen2-1.5B"):
+ def run_llm(text_input, model_id="Qwen2-1.5B"):
     global messages
     tokenizer = AutoTokenizer.from_pretrained(llms[model_id]["model"], trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(llms[model_id]["model"], trust_remote_code=True)
- 
-     system_prompt = llms[model_id]["prefix"]
 
     if messages is None:
         messages = [
-             {"role": "system", "content": system_prompt},
+             {"role": "system", "content": llms[model_id]["prefix"]},
             {"role": "user", "content": text_input},
         ]
     else:
         messages.append({"role": "user", "content": text_input})
 
 
-     text = tokenizer.apply_chat_template(
+     text = tokenizer.apply_chat_template (
         messages,
         tokenize=False,
         add_generation_prompt=True,
@@ -51,12 +61,86 @@ def run_example(text_input, model_id="Qwen2-1.5B"):
 
     return response
 
+ def run_vlm(image, text_input, model_id="Qwen2-vl-2B", prompt="<OD>"):
+     if "Qwen" in model_id:
+         model = Qwen2VLForConditionalGeneration.from_pretrained(vlms[model_id]["model"], torch_dtype="auto", device_map="auto")
+     else:
+         model = AutoModelForCausalLM.from_pretrained(vlms[model_id]["model"], trust_remote_code=True)
+     processor = AutoProcessor.from_pretrained(vlms[model_id]["model"], trust_remote_code=True)
+ 
+     if "Qwen" in model_id:
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
+                     {"type": "text", "text": vlms[model_id]["prefix"]},
+                     {"type": "text", "text": text_input},
+                 ],
+             }
+         ]
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         ).to(model.device)
+         generated_ids = model.generate(**inputs, max_new_tokens=256)
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )
+         print(output_text)
+         pattern = r'\[\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*\]'
+         matches = re.findall(pattern, str(output_text))
+         parsed_boxes = [[float(num) for num in match] for match in matches]
+         scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
+         print(scaled_boxes)
+         draw = draw_bounding_boxes(image, scaled_boxes)
+     else:
+         messages = prompt + text_input
+         inputs = processor(text=messages, images=image, return_tensors="pt").to(model.device)
+         generated_ids = model.generate(
+             input_ids=inputs["input_ids"],
+             pixel_values=inputs["pixel_values"],
+             max_new_tokens=1024,
+             early_stopping=False,
+             do_sample=False,
+             num_beams=3,
+         )
+         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+         parsed_answer = processor.post_process_generation(
+             generated_text,
+             task=prompt,
+             image_size=(image.width, image.height)
+         )
+         print(parsed_answer)
+         if prompt == '<OD>':
+             parsed_boxes = parsed_answer['<OD>']['bboxes']
+             draw = florence_draw_bboxes(image, parsed_answer)
+             output_text = "None"
+         elif prompt == '<OCR>':
+             output_text = parsed_answer['<OCR>']
+             draw = image
+             parsed_boxes = None
+ 
+     return output_text, parsed_boxes, draw
 
 messages = list()
 def reset_conversation():
     global messages
     messages = list()
 
+ def update_task_dropdown(model):
+     if "Florence" in model:
+         return gr.Dropdown(visible=True)
+     return gr.Dropdown(visible=False)
+ 
 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -74,20 +158,34 @@ with gr.Blocks() as demo:
                 model_output_text = gr.Textbox(label="Model Output Text")
 
 
-         submit_btn.click(run_example,
+         submit_btn.click(run_llm,
                          [text_input, model_selector],
                          [model_output_text])
 
         reset_btn.click(reset_conversation)
 
     with gr.Tab(label="VLM (WIP)"):
+         # taken from https://huggingface.co/spaces/maxiw/Qwen2-VL-Detection/blob/main/app.py
         with gr.Row():
             with gr.Column():
                 input_img = gr.Image(label="Input Image", type="pil")
-                 model_selector = gr.Dropdown(choices=list(vlms.keys()), label="Model", value="Qwen2-1.5B")
+                 model_selector = gr.Dropdown(choices=list(vlms.keys()), label="Model", value="Florence-2-base")
+                 task_select = gr.Dropdown(choices=tasks, label="Task", value="<OD>")
                 text_input = gr.Textbox(label="User Prompt")
                 submit_btn = gr.Button(value="Submit")
-                 reset_btn = gr.Button(value="Reset conversation")
+             with gr.Column():
+                 model_output_text = gr.Textbox(label="Model Output Text")
+                 parsed_boxes = gr.Textbox(label="Parsed Boxes")
+                 annotated_image = gr.Image(label="Annotated Image")
+ 
+         model_selector.change(update_task_dropdown, inputs=model_selector, outputs=task_select)
+ 
+ 
+         submit_btn.click(run_vlm,
+                          [input_img, text_input, model_selector, task_select],
+                          [model_output_text, parsed_boxes, annotated_image])
+ 
+ 
 
 if __name__ == "__main__":
     demo.launch()
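
Note on the Qwen branch of run_vlm above: the system prefix asks the model to emit boxes on a 1000×1000 grid, and the regex plus rescale_bounding_boxes map them back to image pixels. A minimal standalone sketch of that post-processing step, using an invented model output string for a 640×480 image (not taken from an actual run):

import re

def parse_and_rescale(output_text, image_width, image_height):
    # Pull every [xmin, ymin, xmax, ymax] group out of the model's free-text answer.
    pattern = r'\[\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*,\s*([.\d]+)\s*\]'
    boxes = [[float(v) for v in m] for m in re.findall(pattern, output_text)]
    # The prompt requests coordinates on a 1000x1000 grid, so scale back to pixel space.
    x_scale, y_scale = image_width / 1000, image_height / 1000
    return [[x1 * x_scale, y1 * y_scale, x2 * x_scale, y2 * y_scale] for x1, y1, x2, y2 in boxes]

# Invented example answer; a real run would come from processor.batch_decode(...).
sample = "The two cups are at [[120, 80, 300, 260], [510, 90, 690, 250]]."
print(parse_and_rescale(sample, 640, 480))
# -> [[76.8, 38.4, 192.0, 124.8], [326.4, 43.2, 441.6, 120.0]]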
requirements.txt CHANGED
@@ -1,4 +1,8 @@
huggingface_hub==0.25.2
torch
+ torchvision
transformers
gradio
+ Pillow
+ qwen_vl_utils
+ accelerate>=0.26.0
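
The new pins back the VLM tab: torchvision and accelerate support loading the Qwen2-VL checkpoints with device_map="auto", Pillow backs the drawing helpers in utils.py, and qwen_vl_utils provides process_vision_info. A quick import check, assuming the requirements install cleanly:

# Smoke test for the added dependencies (assumes `pip install -r requirements.txt` has run).
import torch, torchvision, transformers, accelerate
from PIL import Image
from qwen_vl_utils import process_vision_info

print("torch", torch.__version__, "| torchvision", torchvision.__version__,
      "| transformers", transformers.__version__, "| accelerate", accelerate.__version__)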
utils.py ADDED
@@ -0,0 +1,42 @@
+ import base64
+ from PIL import ImageDraw, ImageFont
+ from io import BytesIO
+ 
+ def image_to_base64(image):
+     buffered = BytesIO()
+     image.save(buffered, format="PNG")
+     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     return img_str
+ 
+ 
+ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
+     draw = ImageDraw.Draw(image)
+     for box in bounding_boxes:
+         xmin, ymin, xmax, ymax = box
+         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
+     return image
+ 
+ def florence_draw_bboxes(image, bounding_boxes, outline_color="red", line_width=2):
+     draw = ImageDraw.Draw(image)
+     # font = ImageFont.truetype("sans-serif.ttf", 16)
+     for bbox, label in zip(bounding_boxes['<OD>']['bboxes'], bounding_boxes['<OD>']['labels']):
+         x1, y1, x2, y2 = bbox
+         draw.rectangle([x1, y1, x2, y2], outline=outline_color, width=line_width)
+         draw.text((x1, y1), label, (255, 255, 255))
+     return image
+ 
+ 
+ def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
+     x_scale = original_width / scaled_width
+     y_scale = original_height / scaled_height
+     rescaled_boxes = []
+     for box in bounding_boxes:
+         xmin, ymin, xmax, ymax = box
+         rescaled_box = [
+             xmin * x_scale,
+             ymin * y_scale,
+             xmax * x_scale,
+             ymax * y_scale
+         ]
+         rescaled_boxes.append(rescaled_box)
+     return rescaled_boxes
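
A short usage sketch of the new helpers, run against a synthetic image rather than real model output (the boxes are made up for illustration):

from PIL import Image
from utils import image_to_base64, rescale_bounding_boxes, draw_bounding_boxes

# Synthetic 640x480 canvas standing in for an uploaded image.
img = Image.new("RGB", (640, 480), "white")

# Boxes in the 1000x1000 coordinate space the Qwen prompt asks for.
model_boxes = [[100, 100, 400, 300], [550, 200, 900, 700]]
pixel_boxes = rescale_bounding_boxes(model_boxes, img.width, img.height)

annotated = draw_bounding_boxes(img, pixel_boxes)
annotated.save("annotated.png")

# Base64 payload in the same form app.py embeds in the Qwen chat message.
print(image_to_base64(annotated)[:60] + "...")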