Update app.py

app.py CHANGED
@@ -11,7 +11,14 @@ from PIL import Image
 from torchvision.transforms.functional import InterpolationMode
 from transformers import AutoModel, AutoTokenizer
 from PIL import Image, ExifTags
-
+import cv2
+import numpy as np
+import torch
+from html2image import Html2Image
+import tempfile
+import os
+import uuid
+from scipy.ndimage import gaussian_filter
 from threading import Thread
 import re
 import time
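The added imports (cv2, numpy, gaussian_filter, Html2Image, tempfile, uuid) point at rendering smoothed attention heatmaps over the uploaded image and assembling them into a video. A minimal sketch of that kind of overlay, assuming a normalized patch-level attention grid; the helper name and signature are illustrative, not part of this commit:

```python
import cv2
import numpy as np
from scipy.ndimage import gaussian_filter

def overlay_attention(image_bgr, attn_grid, alpha=0.5, sigma=2.0):
    """Blend a low-resolution attention grid over an image (illustrative only)."""
    attn = gaussian_filter(attn_grid.astype(np.float32), sigma=sigma)  # soften patch edges
    attn = (attn - attn.min()) / (np.ptp(attn) + 1e-8)                 # normalize to [0, 1]
    h, w = image_bgr.shape[:2]
    attn = cv2.resize(attn, (w, h), interpolation=cv2.INTER_LINEAR)
    heat = cv2.applyColorMap((attn * 255).astype(np.uint8), cv2.COLORMAP_JET)
    return cv2.addWeighted(image_bgr, 1 - alpha, heat, alpha, 0)
```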
@@ -90,7 +97,7 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail
     if use_thumbnail and len(processed_images) != 1:
         thumbnail_img = image.resize((image_size, image_size))
         processed_images.append(thumbnail_img)
-    return processed_images
+    return processed_images, target_aspect_ratio
 
 def correct_image_orientation(image_path):
     # Open the image
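dynamic_preprocess now returns the tiling grid it chose alongside the tiles, so every caller has to unpack a 2-tuple. A minimal sketch of the new contract, with a stub standing in for the real tiling logic (the stub and its (1, 1) grid below are assumptions for illustration):

```python
from typing import List, Tuple
from PIL import Image

def dynamic_preprocess_stub(image: Image.Image, image_size: int = 448,
                            max_num: int = 12) -> Tuple[List[Image.Image], Tuple[int, int]]:
    # Stand-in for the real tiler: one square tile plus the (cols, rows) grid used.
    target_aspect_ratio = (1, 1)
    tiles = [image.resize((image_size, image_size))]
    return tiles, target_aspect_ratio

# Call sites must unpack both values now:
tiles, grid = dynamic_preprocess_stub(Image.new("RGB", (800, 600)))
```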
@@ -114,194 +121,48 @@ def correct_image_orientation(image_path):
         print("Could not process Exif:", e)
 
     return image
-
-def load_image(image_file, input_size=448, max_num=12):
+
+def load_image(image_file, input_size=448, max_num=12, target_aspect_ratio=False):
     image = correct_image_orientation(image_file).convert('RGB')
-    print("Image size: ", image.size)
     transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
     pixel_values = [transform(image) for image in images]
     pixel_values = torch.stack(pixel_values)
-    return pixel_values
-
+    if target_aspect_ratio:
+        return pixel_values, target_aspect_ratio
+    else:
+        return pixel_values
+
 model = AutoModel.from_pretrained(
-    "
+    "khang119966/Vintern-1B-v3_5-explainableAI",
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True,
 ).eval().cuda()
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)
 
 @spaces.GPU
-def
-
-
+def generate_video(image, prompt, max_tokens):
+    pixel_values, target_aspect_ratio = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
+    generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False, num_beams=3, repetition_penalty=2.5)
+    response, query = model.chat(tokenizer, pixel_values, '<image>\n'+prompt, generation_config, return_history=False, \
+        attention_visualize=True, last_visualize_layers=7, raw_image_path=test_image, target_aspect_ratio=target_aspect_ratio)
+    print(response)
+    return "path_to_generated_video.mp4"
+
+demo = gr.Blocks(css=CSS, js=js, theme='NoCrypt/miku')
 
-
-
-We currently only support one image at the start of the context! Please start a new conversation."""
-
-    if len(history) == 0 and len(message["files"]) != 0:
-        if "path" in message["files"][0]:
-            test_image = message["files"][0]["path"]
-        else:
-            test_image = message["files"][0]
-        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-    elif len(history) == 0 and len(message["files"]) == 0:
-        pixel_values = None
-    elif history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-        test_image = history[0][0][0]
-        pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-    else:
-        pixel_values = None
-
+with gr.Blocks() as demo:
+    gr.Markdown("### Simple VLM Demo")
 
-
+    with gr.Row():
+        with gr.Column():
+            image = gr.Image(label="Upload your image", type="pil")
+            prompt = gr.Textbox(label="Describe your prompt")
+            max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=100)
+            btn = gr.Button("Attention Video")
+            video = gr.Video(label="Attention Video")
+
+    btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)
 
-    if len(history) == 0:
-        if pixel_values is not None:
-            question = '<image>\n'+message["text"]
-        else:
-            question = message["text"]
-        response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-    else:
-        conv_history = []
-        if history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-            start_index = 1
-        else:
-            start_index = 0
-
-        for i, chat_pair in enumerate(history[start_index:]):
-            if i == 0 and start_index == 1:
-                conv_history.append(tuple(['<image>\n'+chat_pair[0], chat_pair[1]]))
-            else:
-                conv_history.append(tuple(chat_pair))
-
-
-        print("conv_history", conv_history)
-        question = message["text"]
-        response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=conv_history, return_history=True)
-
-    print(f'User: {question}\nAssistant: {response}')
-
-    # return response
-    buffer = ""
-    for new_text in response:
-        buffer += new_text
-        generated_text_without_prompt = buffer[:]
-        time.sleep(0.02)
-        yield generated_text_without_prompt
-
-CSS = """
-#component-10 {
-    height: 70dvh !important;
-    transform-origin: top; /* Make sure the element expands from the top down */
-    border-style: solid;
-    overflow: hidden;
-    flex-grow: 1;
-    min-width: min(160px, 100%);
-    border-width: var(--block-border-width);
-}
-
-/* Make sure images inside buttons with the given aria-label render correctly */
-button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] img.svelte-1pijsyv {
-    width: 100%;
-    object-fit: contain;
-    height: 100%;
-    border-radius: 13px; /* Round the image corners */
-    max-width: 50vw; /* Cap the image width */
-}
-/* Set the button height and allow text selection only for buttons with the given aria-label */
-button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] {
-    user-select: text;
-    text-align: left;
-    height: 300px;
-}
-/* Round the corners and cap the width of images outside the avatar container */
-.message-wrap.svelte-1lcyrx4 > div.svelte-1lcyrx4 .svelte-1lcyrx4:not(.avatar-container) img {
-    border-radius: 13px;
-    max-width: 50vw;
-}
-.message-wrap.svelte-1lcyrx4 .message.svelte-1lcyrx4 img {
-    margin: var(--size-2);
-    max-height: 500px;
-}
-.image-preview-close-button {
-    position: relative; /* In case positioning is needed */
-    width: 5%; /* Button width */
-    height: 5%; /* Button height */
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    padding: 0; /* Avoid effects from the default padding */
-    border: none; /* Optionally remove the border */
-    background: none; /* Optionally remove the background */
-}
-
-.example-image-container.svelte-9pi8y1 {
-    width: calc(var(--size-8) * 5);
-    height: calc(var(--size-8) * 5);
-    border-radius: var(--radius-lg);
-    overflow: hidden;
-    position: relative;
-    margin-bottom: var(--spacing-lg);
-}
-"""
-
-js = """
-function forceLightTheme() {
-    const url = new URL(window.location);
-
-    // Update __theme to light if it is set to anything else
-    if (url.searchParams.get('__theme') !== 'light') {
-        url.searchParams.set('__theme', 'light');
-        // Update the URL without reloading the page if needed
-        window.history.replaceState({}, '', url.href);
-    }
-
-    // Make sure the document always applies the light theme
-    document.documentElement.setAttribute('data-theme', 'light');
-}
-"""
-from transformers import pipeline
-
-pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo", torch_dtype=torch.float16, device="cuda:0")
-
-@spaces.GPU
-def transcribe_speech(filepath):
-    output = pipe(
-        filepath,
-        max_new_tokens=256,
-        generate_kwargs={
-            "task": "transcribe",
-        },
-        chunk_length_s=30,
-        batch_size=1,
-    )
-    return output["text"]
-
-demo = gr.Blocks(css=CSS, js=js, theme='NoCrypt/miku')
-
-with demo:
-    chat_demo_interface = gr.ChatInterface(
-        fn=chat,
-        description="""**Vintern-1B-v3.5** is the latest model in the Vintern series, bringing major improvements over v2 across all benchmarks. This **continuously fine-tuned version** enhances Vietnamese capabilities while retaining strong English performance. It excels in OCR, text recognition, and Vietnam-specific document understanding.""",
-        examples=[{"text": "Hãy viết một email giới thiệu sản phẩm trong ảnh.", "files": ["./demo_3.jpg"]},
-                  {"text": "Trích xuất các thông tin từ ảnh trả về markdown.", "files": ["./demo_1.jpg"]},
-                  {"text": "Bạn là nhân viên marketing chuyên nghiệp. Hãy viết một bài quảng cáo dài trên mạng xã hội giới thiệu về cửa hàng.", "files": ["./demo_2.jpg"]},
-                  {"text": "Trích xuất thông tin kiện hàng trong ảnh và trả về dạng JSON.", "files": ["./demo_4.jpg"]}],
-        title="❄️ Vintern-1B-v3.5 Demo ❄️",
-        multimodal=True,
-        css=CSS,
-        js=js,
-        theme='NoCrypt/miku'
-    )
-
-    # mic_transcribe = gr.Interface(
-    #     fn=transcribe_speech,
-    #     inputs=gr.Audio(sources="microphone", type="filepath", editable=False),
-    #     outputs=gr.components.Textbox(),
-    # )
-
-    # chat_demo_interface.queue()
 demo.queue().launch()
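As committed, the new code has a few issues worth flagging: `generate_video` reads an undefined `test_image` instead of its `image` argument, calls `.to(torch.bfloat16).cuda()` on the tuple returned by `load_image`, and returns a hard-coded placeholder path; the stray `demo = gr.Blocks(css=CSS, js=js, ...)` line also references the CSS and js globals this same commit deletes. A corrected sketch, assuming the model's `attention_visualize=True` path writes its video to a location the caller can name — that output path, and saving the PIL upload to a temp file, are assumptions, not the model's documented API:

```python
import tempfile

@spaces.GPU
def generate_video(image, prompt, max_tokens):
    # gr.Image(type="pil") hands us a PIL image, while load_image and
    # raw_image_path work with file paths, so persist the upload first.
    tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
    image.save(tmp.name)
    pixel_values, target_aspect_ratio = load_image(tmp.name, max_num=6)
    pixel_values = pixel_values.to(torch.bfloat16).cuda()  # move the tensor, not the tuple
    generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False,
                             num_beams=3, repetition_penalty=2.5)
    response, query = model.chat(
        tokenizer, pixel_values, '<image>\n' + prompt, generation_config,
        return_history=False, attention_visualize=True, last_visualize_layers=7,
        raw_image_path=tmp.name, target_aspect_ratio=target_aspect_ratio)
    print(response)
    # Assumption: the model's custom code writes the attention video to a
    # known path; return that path rather than a hard-coded placeholder.
    return "attention_video.mp4"
```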
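One more quirk: `load_image`'s new `target_aspect_ratio=False` parameter never acts as a flag, because the tuple-unpacking of `dynamic_preprocess`'s return value rebinds the name before the `if` runs, so the grid comes back whenever it is truthy. If an opt-in flag was the intent, a sketch along these lines (same helpers as the diff, with the flag renamed to avoid the shadowing) keeps the two meanings separate:

```python
def load_image(image_file, input_size=448, max_num=12, return_aspect_ratio=False):
    image = correct_image_orientation(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images, grid = dynamic_preprocess(image, image_size=input_size,
                                      use_thumbnail=True, max_num=max_num)
    pixel_values = torch.stack([transform(tile) for tile in images])
    # The caller decides whether the tiling grid comes back, instead of the
    # flag being silently overwritten by dynamic_preprocess's return value.
    return (pixel_values, grid) if return_aspect_ratio else pixel_values
```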