File size: 5,823 Bytes
e352103
8640a7d
36be50d
e352103
 
 
cf8e08c
 
18c7142
 
a1d286e
 
 
 
 
 
 
 
e352103
 
7d72183
e352103
de4762a
a1d286e
 
 
7d72183
 
a1d286e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d72183
a1d286e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18c7142
 
e352103
a1d286e
 
 
 
 
 
de4762a
36be50d
 
 
 
6d64276
36be50d
a1d286e
36be50d
a1d286e
 
 
 
36be50d
 
 
c728a92
7d72183
334a9a6
 
1f3cc30
3851363
36be50d
a1d286e
 
36be50d
18c7142
 
de4762a
18c7142
36be50d
 
a1d286e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import re
import subprocess
import sys
import time
from io import BytesIO
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer

# Checkpoint used for both the processor and the model weights.
MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Install flash-attn at startup, before the model is loaded with
# `_attn_implementation="flash_attention_2"` below.
# - The child environment EXTENDS os.environ: passing only the one variable
#   would replace the whole environment (PATH, HOME, proxy settings, ...).
# - `sys.executable -m pip` installs into the interpreter running this app,
#   and the argument list avoids going through a shell.
# - The install stays best-effort (check=False); a failure is reported rather
#   than silently ignored.
_flash_attn_install = subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)
if _flash_attn_install.returncode != 0:
    print(f"flash-attn installation failed with exit code {_flash_attn_install.returncode}")

# Processor and model are loaded once at import time and shared by every request.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16
).to("cuda:0")


# File extensions (lower-case) used to classify uploaded files.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
_VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi", ".mkv", ".flv")


def _collect_media(file_paths):
    """Classify uploaded file paths as image or video entries by extension."""
    media = []
    for file_path in file_paths:
        lowered = file_path.lower()
        if lowered.endswith(_IMAGE_EXTENSIONS):
            media.append({"type": "image", "path": file_path})
        elif lowered.endswith(_VIDEO_EXTENSIONS):
            media.append({"type": "video", "path": file_path})
        else:
            # Tell the user instead of silently dropping the file.
            gr.Warning(f"Unsupported file type, ignoring: {file_path}")
    return media


def _build_user_content(text, media_queue):
    """Interleave text and media into the content list of one user message.

    Each ``<image>`` / ``<video>`` placeholder in ``text`` consumes the next
    queued media item when its type matches. A placeholder that cannot be
    satisfied is kept as literal text and a warning is shown. Media not
    consumed by any placeholder is appended at the end, in upload order.
    """
    if "<image>" not in text and "<video>" not in text:
        content = [{"type": "text", "text": text}] if text else []
        content.extend(media_queue)
        return content

    content = []
    next_media = 0  # index of the first media item not yet consumed
    for part in re.split(r'(<image>|<video>)', text):
        if part in ("<image>", "<video>"):
            # Placeholders must be handled before the generic text branch,
            # otherwise the "no more media" warning can never be reached.
            if next_media >= len(media_queue):
                gr.Warning(f"Placeholder {part} found, but no more media items available.")
                content.append({"type": "text", "text": part})
                continue

            media_item = media_queue[next_media]
            if media_item["type"] == part[1:-1]:
                content.append(media_item)
                next_media += 1
            else:
                # Leave the media item queued for a later matching placeholder.
                actual = "a video" if part == "<image>" else "an image"
                gr.Warning(f"Placeholder {part} found, but next media is {actual}: {media_item['path']}. Skipping placeholder.")
                content.append({"type": "text", "text": part})
        elif part.strip():
            content.append({"type": "text", "text": part.strip()})

    content.extend(media_queue[next_media:])
    return content


@spaces.GPU
def model_inference(
    input_dict, history, max_tokens
):
    """Stream the model's answer to one multimodal chat message.

    Args:
        input_dict: Message dict with a ``"text"`` entry and an optional
            ``"files"`` list of uploaded file paths.
        history: Previous chat turns. Unused: every request is sent to the
            model as a fresh single-turn conversation.
        max_tokens: Passed to ``model.generate`` as ``max_new_tokens``.

    Yields:
        The generated text accumulated so far, once per streamed chunk.

    Raises:
        gr.Error: If the processor fails to build model inputs from the message.
    """
    text = input_dict["text"].strip()
    media_queue = _collect_media(input_dict.get("files") or [])

    # A text query is mandatory; the hint depends on whether media was attached.
    if not text:
        if media_queue:
            gr.Warning("Please input a text query along with the image(s)/video(s).")
        else:
            gr.Warning("Please input a query and optionally image(s)/video(s).")
        return

    resulting_messages = [
        {"role": "user", "content": _build_user_content(text, media_queue)}
    ]
    print("resulting_messages", resulting_messages)

    try:
        inputs = processor.apply_chat_template(
            resulting_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
    except Exception as e:
        print(f"Processor Error: {e}")
        print("Problematic message structure:", resulting_messages)
        # gr.Error is an exception: it only reaches the UI when raised.
        raise gr.Error(f"Error during input processing: {e}") from e

    inputs = inputs.to(model.device)

    # Cast pixel values to the model's dtype (text-only inputs have none).
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)

    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)

    # Generation runs in a background thread so the streamer can be consumed
    # here and partial output yielded as it arrives.
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    thread.join()


# (query text, attached files) pairs offered as one-click examples in the UI.
_EXAMPLE_QUERIES = (
    ("Where do the severe droughts happen according to this diagram?", ["example_images/examples_weather_events.png"]),
    ("What art era this artpiece <image> and this artpiece <image> belong to?", ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]),
    ("Describe this image.", ["example_images/mosque.jpg"]),
    ("When was this purchase made and how much did it cost?", ["example_images/fiche.jpg"]),
    ("What is the date in this document?", ["example_images/document.jpg"]),
    ("What is happening in the video?", ["example_images/short.mp4"]),
)

# Each example is a one-element row holding the multimodal message dict.
examples = [[{"text": query, "files": files}] for query, files in _EXAMPLE_QUERIES]

# Input widgets: the multimodal query box and the generation-length slider.
_query_box = gr.MultimodalTextbox(
    label="Query Input",
    file_types=["image", ".mp4"],
    file_count="multiple",
)
_max_tokens_slider = gr.Slider(
    minimum=100,
    maximum=500,
    step=50,
    value=200,
    label="Max Tokens",
)

# Chat UI wired to `model_inference`; the slider value is passed to it as an
# additional input.
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smollest Video Model Ever 📺",
    description=(
        "Play with [SmolVLM2-256M-Video-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-256M-Video-Instruct) in this demo. "
        "To get started, upload an image/video and text or try one of the examples. "
        "This demo doesn't use history for the chat, so every chat you start is a new conversation."
    ),
    examples=examples,
    textbox=_query_box,
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[_max_tokens_slider],
    type="messages",
)


demo.launch(debug=True)