khang119966 committed
Commit 45d0b80 · verified · 1 Parent(s): a0dae6d

Update app.py

Files changed (1):
  1. app.py +39 -178

app.py CHANGED
@@ -11,7 +11,14 @@ from PIL import Image
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
  from PIL import Image, ExifTags
-
+ import cv2
+ import numpy as np
+ import torch
+ from html2image import Html2Image
+ import tempfile
+ import os
+ import uuid
+ from scipy.ndimage import gaussian_filter
  from threading import Thread
  import re
  import time
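
The new imports (cv2, numpy, scipy's gaussian_filter, html2image) are presumably there to render the attention video that this commit's generate_video returns. As a rough, hypothetical illustration of that role only — nothing below appears in the commit, and all names are ours:

# Hypothetical sketch: turning a coarse attention map into a heatmap overlay,
# i.e. one frame of an "attention video".
import cv2
import numpy as np
from scipy.ndimage import gaussian_filter

def overlay_attention_frame(frame_bgr: np.ndarray, attn: np.ndarray) -> np.ndarray:
    """Blend a low-resolution attention map onto a full-resolution frame."""
    h, w = frame_bgr.shape[:2]
    heat = cv2.resize(attn.astype(np.float32), (w, h))    # upsample the tile grid
    heat = gaussian_filter(heat, sigma=15)                # smooth blocky edges
    heat = (255 * (heat - heat.min()) / (np.ptp(heat) + 1e-8)).astype(np.uint8)
    heat = cv2.applyColorMap(heat, cv2.COLORMAP_JET)      # colorize
    return cv2.addWeighted(frame_bgr, 0.6, heat, 0.4, 0)  # alpha-blend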
@@ -90,7 +97,7 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna
      if use_thumbnail and len(processed_images) != 1:
          thumbnail_img = image.resize((image_size, image_size))
          processed_images.append(thumbnail_img)
-     return processed_images
+     return processed_images, target_aspect_ratio

  def correct_image_orientation(image_path):
      # Open the image
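
For context: target_aspect_ratio is the (columns, rows) tiling grid that dynamic_preprocess selects before cropping the image into image_size×image_size tiles, and the attention-visualization call added further down apparently needs it to map per-tile attention back onto the original image. A minimal sketch of the grid selection, assuming the standard InternVL-style helper that Vintern demos inherit — this illustrates, it does not quote, app.py's code:

# Sketch of InternVL-style grid selection (names illustrative).
def closest_tiling_grid(width, height, min_num=1, max_num=12, image_size=448):
    aspect_ratio = width / height
    # Every (cols, rows) grid whose tile count lies in [min_num, max_num].
    grids = sorted(
        {(i, j) for n in range(min_num, max_num + 1)
                for i in range(1, n + 1) for j in range(1, n + 1)
                if min_num <= i * j <= max_num},
        key=lambda g: g[0] * g[1])
    best, best_diff = (1, 1), float("inf")
    for cols, rows in grids:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best, best_diff = (cols, rows), diff
    return best  # e.g. a 1024x512 image -> (2, 1): two tiles side by side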
@@ -114,194 +121,48 @@ def correct_image_orientation(image_path):
          print("Cannot process Exif:", e)

      return image
-
- def load_image(image_file, input_size=448, max_num=12):
+
+ def load_image(image_file, input_size=448, max_num=12, target_aspect_ratio=False):
      image = correct_image_orientation(image_file).convert('RGB')
-     print("Image size: ", image.size)
      transform = build_transform(input_size=input_size)
-     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+     images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
      pixel_values = [transform(image) for image in images]
      pixel_values = torch.stack(pixel_values)
-     return pixel_values
-
+     if target_aspect_ratio:
+         return pixel_values, target_aspect_ratio
+     else:
+         return pixel_values
+
  model = AutoModel.from_pretrained(
-     "5CD-AI/Vintern-1B-v3_5",
+     "khang119966/Vintern-1B-v3_5-explainableAI",
      torch_dtype=torch.bfloat16,
      low_cpu_mem_usage=True,
      trust_remote_code=True,
  ).eval().cuda()
- tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-1B-v3_5", trust_remote_code=True, use_fast=False)
+ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)

  @spaces.GPU
- def chat(message, history):
-     print("history", history)
-     print("message", message)
+ def generate_video(image, prompt, max_tokens):
+     pixel_values, target_aspect_ratio = load_image(image, max_num=6, target_aspect_ratio=True)
+     pixel_values = pixel_values.to(torch.bfloat16).cuda()
+     generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False, num_beams=3, repetition_penalty=2.5)
+     response, query = model.chat(tokenizer, pixel_values, '<image>\n' + prompt, generation_config, return_history=False,
+                                  attention_visualize=True, last_visualize_layers=7,
+                                  raw_image_path=image, target_aspect_ratio=target_aspect_ratio)
+     print(response)
+     return "path_to_generated_video.mp4"

-     if len(history) != 0 and len(message["files"]) != 0:
-         return """We currently only support one image at the start of the context! Please start a new conversation."""
-
-     if len(history) == 0 and len(message["files"]) != 0:
-         if "path" in message["files"][0]:
-             test_image = message["files"][0]["path"]
-         else:
-             test_image = message["files"][0]
-         pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-     elif len(history) == 0 and len(message["files"]) == 0:
-         pixel_values = None
-     elif history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-         test_image = history[0][0][0]
-         pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-     else:
-         pixel_values = None
-
+ with gr.Blocks(theme='NoCrypt/miku') as demo:
+     gr.Markdown("### Simple VLM Demo")

-     generation_config = dict(max_new_tokens=700, do_sample=False, num_beams=3, repetition_penalty=2.5)
+     with gr.Row():
+         with gr.Column():
+             image = gr.Image(label="Upload your image", type="filepath")
+             prompt = gr.Textbox(label="Describe your prompt")
+             max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=100)
+             btn = gr.Button("Attention Video")
+             video = gr.Video(label="Attention Video")
+
+     btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)

-     if len(history) == 0:
-         if pixel_values is not None:
-             question = '<image>\n' + message["text"]
-         else:
-             question = message["text"]
-         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-     else:
-         conv_history = []
-         if history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-             start_index = 1
-         else:
-             start_index = 0
-
-         for i, chat_pair in enumerate(history[start_index:]):
-             if i == 0 and start_index == 1:
-                 conv_history.append(tuple(['<image>\n' + chat_pair[0], chat_pair[1]]))
-             else:
-                 conv_history.append(tuple(chat_pair))
-
-         print("conv_history", conv_history)
-         question = message["text"]
-         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=conv_history, return_history=True)
-
-     print(f'User: {question}\nAssistant: {response}')
-
-     # return response
-     buffer = ""
-     for new_text in response:
-         buffer += new_text
-         generated_text_without_prompt = buffer[:]
-         time.sleep(0.02)
-         yield generated_text_without_prompt
-
- CSS = """
- #component-10 {
-     height: 70dvh !important;
-     transform-origin: top; /* Ensure the element expands from the top down */
-     border-style: solid;
-     overflow: hidden;
-     flex-grow: 1;
-     min-width: min(160px, 100%);
-     border-width: var(--block-border-width);
- }
-
- /* Make images inside buttons with the given aria-label display correctly */
- button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] img.svelte-1pijsyv {
-     width: 100%;
-     object-fit: contain;
-     height: 100%;
-     border-radius: 13px; /* Round the image corners */
-     max-width: 50vw; /* Limit the image width */
- }
- /* Set the button height and allow text selection only for buttons with the given aria-label */
- button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] {
-     user-select: text;
-     text-align: left;
-     height: 300px;
- }
- /* Round corners and cap the width of images outside the avatar container */
- .message-wrap.svelte-1lcyrx4 > div.svelte-1lcyrx4 .svelte-1lcyrx4:not(.avatar-container) img {
-     border-radius: 13px;
-     max-width: 50vw;
- }
- .message-wrap.svelte-1lcyrx4 .message.svelte-1lcyrx4 img {
-     margin: var(--size-2);
-     max-height: 500px;
- }
- .image-preview-close-button {
-     position: relative; /* In case positioning is needed */
-     width: 5%; /* Button width */
-     height: 5%; /* Button height */
-     display: flex;
-     justify-content: center;
-     align-items: center;
-     padding: 0; /* Avoid interference from the default padding */
-     border: none; /* Optional: remove the border */
-     background: none; /* Optional: remove the background */
- }
-
- .example-image-container.svelte-9pi8y1 {
-     width: calc(var(--size-8) * 5);
-     height: calc(var(--size-8) * 5);
-     border-radius: var(--radius-lg);
-     overflow: hidden;
-     position: relative;
-     margin-bottom: var(--spacing-lg);
- }
- """
-
- js = """
- function forceLightTheme() {
-     const url = new URL(window.location);
-
-     // Set __theme to light if it is not already
-     if (url.searchParams.get('__theme') !== 'light') {
-         url.searchParams.set('__theme', 'light');
-         // Update the URL without reloading the page
-         window.history.replaceState({}, '', url.href);
-     }
-
-     // Make sure the document always applies the light theme
-     document.documentElement.setAttribute('data-theme', 'light');
- }
- """
- from transformers import pipeline
-
- pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo", torch_dtype=torch.float16, device="cuda:0")
-
- @spaces.GPU
- def transcribe_speech(filepath):
-     output = pipe(
-         filepath,
-         max_new_tokens=256,
-         generate_kwargs={
-             "task": "transcribe",
-         },
-         chunk_length_s=30,
-         batch_size=1,
-     )
-     return output["text"]
-
- demo = gr.Blocks(css=CSS, js=js, theme='NoCrypt/miku')
-
- with demo:
-     chat_demo_interface = gr.ChatInterface(
-         fn=chat,
-         description="""**Vintern-1B-v3.5** is the latest model in the Vintern series, bringing major improvements over v2 across all benchmarks. This continuously fine-tuned version strengthens Vietnamese capabilities while retaining strong English performance, and excels at OCR, text recognition, and Vietnam-specific document understanding.""",
-         examples=[{"text": "Write an email introducing the product in the image.", "files": ["./demo_3.jpg"]},
-                   {"text": "Extract the information from the image and return it as markdown.", "files": ["./demo_1.jpg"]},
-                   {"text": "You are a professional marketer. Write a long social media post advertising the store.", "files": ["./demo_2.jpg"]},
-                   {"text": "Extract the parcel information in the image and return it as JSON.", "files": ["./demo_4.jpg"]}],
-         title="❄️ Vintern-1B-v3.5 Demo ❄️",
-         multimodal=True,
-         css=CSS,
-         js=js,
-         theme='NoCrypt/miku'
-     )
-
-     # mic_transcribe = gr.Interface(
-     #     fn=transcribe_speech,
-     #     inputs=gr.Audio(sources="microphone", type="filepath", editable=False),
-     #     outputs=gr.components.Textbox(),
-     # )
-
-     # chat_demo_interface.queue()
  demo.queue().launch()
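
For anyone trying the new path outside Gradio, a minimal usage sketch assembled only from calls visible in this diff. Caveats: in this commit generate_video still returns the placeholder string "path_to_generated_video.mp4" rather than a real file; the attention_visualize, last_visualize_layers, raw_image_path, and target_aspect_ratio keyword arguments come from the model repo's custom remote code, so their exact semantics are assumed; "demo_1.jpg" is a stand-in path (load_image expects a file path, which gr.Image(type="filepath") supplies in the app itself).

# Hedged sketch: exercising the attention-visualization call outside the UI.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_ID = "khang119966/Vintern-1B-v3_5-explainableAI"
model = AutoModel.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True, trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)

# load_image is app.py's helper; with target_aspect_ratio=True it returns
# the stacked tile tensor plus the tiling grid chosen by dynamic_preprocess.
pixel_values, grid = load_image("demo_1.jpg", max_num=6, target_aspect_ratio=True)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=100, do_sample=False, num_beams=3,
                         repetition_penalty=2.5)
response, query = model.chat(
    tokenizer, pixel_values,
    '<image>\nExtract the information from the image and return it as markdown.',
    generation_config, return_history=False,
    attention_visualize=True, last_visualize_layers=7,
    raw_image_path="demo_1.jpg", target_aspect_ratio=grid)
print(response)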
 