khang119966 committed
Commit 45d0b80 · verified · 1 Parent(s): a0dae6d

Update app.py

Files changed (1):
  1. app.py +39 -178

app.py CHANGED
@@ -11,7 +11,14 @@ from PIL import Image
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
  from PIL import Image, ExifTags
-
+ import cv2
+ import numpy as np
+ import torch
+ from html2image import Html2Image
+ import tempfile
+ import os
+ import uuid
+ from scipy.ndimage import gaussian_filter
  from threading import Thread
  import re
  import time
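
The new imports (cv2, numpy, scipy's gaussian_filter, html2image) are presumably there to render the attention video that this commit's generate_video returns. As a rough, hypothetical illustration of that role only — nothing below appears in the commit, and all names are ours:

# Hypothetical sketch: turning a coarse attention map into a heatmap overlay,
# i.e. one frame of an "attention video".
import cv2
import numpy as np
from scipy.ndimage import gaussian_filter

def overlay_attention_frame(frame_bgr: np.ndarray, attn: np.ndarray) -> np.ndarray:
    """Blend a low-resolution attention map onto a full-resolution frame."""
    h, w = frame_bgr.shape[:2]
    heat = cv2.resize(attn.astype(np.float32), (w, h))    # upsample the tile grid
    heat = gaussian_filter(heat, sigma=15)                # smooth blocky edges
    heat = (255 * (heat - heat.min()) / (np.ptp(heat) + 1e-8)).astype(np.uint8)
    heat = cv2.applyColorMap(heat, cv2.COLORMAP_JET)      # colorize
    return cv2.addWeighted(frame_bgr, 0.6, heat, 0.4, 0)  # alpha-blend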
@@ -90,7 +97,7 @@ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbna
      if use_thumbnail and len(processed_images) != 1:
          thumbnail_img = image.resize((image_size, image_size))
          processed_images.append(thumbnail_img)
-     return processed_images
+     return processed_images, target_aspect_ratio

  def correct_image_orientation(image_path):
      # Open the image
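
For context: target_aspect_ratio is the (columns, rows) tiling grid that dynamic_preprocess selects before cropping the image into image_size×image_size tiles, and the attention-visualization call added further down apparently needs it to map per-tile attention back onto the original image. A minimal sketch of the grid selection, assuming the standard InternVL-style helper that Vintern demos inherit — this illustrates, it does not quote, app.py's code:

# Sketch of InternVL-style grid selection (names illustrative).
def closest_tiling_grid(width, height, min_num=1, max_num=12, image_size=448):
    aspect_ratio = width / height
    # Every (cols, rows) grid whose tile count lies in [min_num, max_num].
    grids = sorted(
        {(i, j) for n in range(min_num, max_num + 1)
                for i in range(1, n + 1) for j in range(1, n + 1)
                if min_num <= i * j <= max_num},
        key=lambda g: g[0] * g[1])
    best, best_diff = (1, 1), float("inf")
    for cols, rows in grids:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best, best_diff = (cols, rows), diff
    return best  # e.g. a 1024x512 image -> (2, 1): two tiles side by side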
@@ -114,194 +121,48 @@ def correct_image_orientation(image_path):
          print("Cannot process Exif:", e)

      return image
-
- def load_image(image_file, input_size=448, max_num=12):
+
+ def load_image(image_file, input_size=448, max_num=12, target_aspect_ratio=False):
      image = correct_image_orientation(image_file).convert('RGB')
-     print("Image size: ", image.size)
      transform = build_transform(input_size=input_size)
-     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+     images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
      pixel_values = [transform(image) for image in images]
      pixel_values = torch.stack(pixel_values)
-     return pixel_values
-
+     if target_aspect_ratio:
+         return pixel_values, target_aspect_ratio
+     else:
+         return pixel_values
+
  model = AutoModel.from_pretrained(
-     "5CD-AI/Vintern-1B-v3_5",
+     "khang119966/Vintern-1B-v3_5-explainableAI",
      torch_dtype=torch.bfloat16,
      low_cpu_mem_usage=True,
      trust_remote_code=True,
  ).eval().cuda()
- tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Vintern-1B-v3_5", trust_remote_code=True, use_fast=False)
+ tokenizer = AutoTokenizer.from_pretrained("khang119966/Vintern-1B-v3_5-explainableAI", trust_remote_code=True, use_fast=False)

  @spaces.GPU
- def chat(message, history):
-     print("history", history)
-     print("message", message)
+ def generate_video(image, prompt, max_tokens):
+     pixel_values, target_aspect_ratio = load_image(image, max_num=6, target_aspect_ratio=True)
+     pixel_values = pixel_values.to(torch.bfloat16).cuda()
+     generation_config = dict(max_new_tokens=int(max_tokens), do_sample=False, num_beams=3, repetition_penalty=2.5)
+     response, query = model.chat(tokenizer, pixel_values, '<image>\n' + prompt, generation_config, return_history=False,
+                                  attention_visualize=True, last_visualize_layers=7,
+                                  raw_image_path=image, target_aspect_ratio=target_aspect_ratio)
+     print(response)
+     return "path_to_generated_video.mp4"

-     if len(history) != 0 and len(message["files"]) != 0:
-         return """We currently only support one image at the start of the context! Please start a new conversation."""
-
-     if len(history) == 0 and len(message["files"]) != 0:
-         if "path" in message["files"][0]:
-             test_image = message["files"][0]["path"]
-         else:
-             test_image = message["files"][0]
-         pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-     elif len(history) == 0 and len(message["files"]) == 0:
-         pixel_values = None
-     elif history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-         test_image = history[0][0][0]
-         pixel_values = load_image(test_image, max_num=6).to(torch.bfloat16).cuda()
-     else:
-         pixel_values = None
-
+ with gr.Blocks(theme='NoCrypt/miku') as demo:
+     gr.Markdown("### Simple VLM Demo")

-     generation_config = dict(max_new_tokens=700, do_sample=False, num_beams=3, repetition_penalty=2.5)
+     with gr.Row():
+         with gr.Column():
+             image = gr.Image(label="Upload your image", type="filepath")
+             prompt = gr.Textbox(label="Describe your prompt")
+             max_tokens = gr.Slider(label="Max token output (⚠️ Choose <100 for faster response)", minimum=1, maximum=512, value=100)
+             btn = gr.Button("Attention Video")
+             video = gr.Video(label="Attention Video")
+
+     btn.click(fn=generate_video, inputs=[image, prompt, max_tokens], outputs=video)

-     if len(history) == 0:
-         if pixel_values is not None:
-             question = '<image>\n' + message["text"]
-         else:
-             question = message["text"]
-         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-     else:
-         conv_history = []
-         if history[0][0][0] is not None and os.path.isfile(history[0][0][0]):
-             start_index = 1
-         else:
-             start_index = 0
-
-         for i, chat_pair in enumerate(history[start_index:]):
-             if i == 0 and start_index == 1:
-                 conv_history.append(tuple(['<image>\n' + chat_pair[0], chat_pair[1]]))
-             else:
-                 conv_history.append(tuple(chat_pair))
-
-         print("conv_history", conv_history)
-         question = message["text"]
-         response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=conv_history, return_history=True)
-
-     print(f'User: {question}\nAssistant: {response}')
-
-     # return response
-     buffer = ""
-     for new_text in response:
-         buffer += new_text
-         generated_text_without_prompt = buffer[:]
-         time.sleep(0.02)
-         yield generated_text_without_prompt
-
- CSS = """
- #component-10 {
-     height: 70dvh !important;
-     transform-origin: top; /* Ensure the element expands from the top down */
-     border-style: solid;
-     overflow: hidden;
-     flex-grow: 1;
-     min-width: min(160px, 100%);
-     border-width: var(--block-border-width);
- }
-
- /* Make images inside buttons with the given aria-label display correctly */
- button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] img.svelte-1pijsyv {
-     width: 100%;
-     object-fit: contain;
-     height: 100%;
-     border-radius: 13px; /* Round the image corners */
-     max-width: 50vw; /* Limit the image width */
- }
- /* Set the button height and allow text selection only for buttons with the given aria-label */
- button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] {
-     user-select: text;
-     text-align: left;
-     height: 300px;
- }
- /* Round corners and cap the width of images outside the avatar container */
- .message-wrap.svelte-1lcyrx4 > div.svelte-1lcyrx4 .svelte-1lcyrx4:not(.avatar-container) img {
-     border-radius: 13px;
-     max-width: 50vw;
- }
- .message-wrap.svelte-1lcyrx4 .message.svelte-1lcyrx4 img {
-     margin: var(--size-2);
-     max-height: 500px;
- }
- .image-preview-close-button {
-     position: relative; /* In case positioning is needed */
-     width: 5%; /* Button width */
-     height: 5%; /* Button height */
-     display: flex;
-     justify-content: center;
-     align-items: center;
-     padding: 0; /* Avoid interference from the default padding */
-     border: none; /* Optional: remove the border */
-     background: none; /* Optional: remove the background */
- }
-
- .example-image-container.svelte-9pi8y1 {
-     width: calc(var(--size-8) * 5);
-     height: calc(var(--size-8) * 5);
-     border-radius: var(--radius-lg);
-     overflow: hidden;
-     position: relative;
-     margin-bottom: var(--spacing-lg);
- }
- """
-
- js = """
- function forceLightTheme() {
-     const url = new URL(window.location);
-
-     // Set __theme to light if it is not already
-     if (url.searchParams.get('__theme') !== 'light') {
-         url.searchParams.set('__theme', 'light');
-         // Update the URL without reloading the page
-         window.history.replaceState({}, '', url.href);
-     }
-
-     // Make sure the document always applies the light theme
-     document.documentElement.setAttribute('data-theme', 'light');
- }
- """
- from transformers import pipeline
-
- pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo", torch_dtype=torch.float16, device="cuda:0")
-
- @spaces.GPU
- def transcribe_speech(filepath):
-     output = pipe(
-         filepath,
-         max_new_tokens=256,
-         generate_kwargs={
-             "task": "transcribe",
-         },
-         chunk_length_s=30,
-         batch_size=1,
-     )
-     return output["text"]
-
- demo = gr.Blocks(css=CSS, js=js, theme='NoCrypt/miku')
-
- with demo:
-     chat_demo_interface = gr.ChatInterface(
-         fn=chat,
-         description="""**Vintern-1B-v3.5** is the latest model in the Vintern series, bringing major improvements over v2 across all benchmarks. This continuously fine-tuned version strengthens Vietnamese capabilities while retaining strong English performance, and excels at OCR, text recognition, and Vietnam-specific document understanding.""",
-         examples=[{"text": "Write an email introducing the product in the image.", "files": ["./demo_3.jpg"]},
-                   {"text": "Extract the information from the image and return it as markdown.", "files": ["./demo_1.jpg"]},
-                   {"text": "You are a professional marketer. Write a long social media post advertising the store.", "files": ["./demo_2.jpg"]},
-                   {"text": "Extract the parcel information in the image and return it as JSON.", "files": ["./demo_4.jpg"]}],
-         title="❄️ Vintern-1B-v3.5 Demo ❄️",
-         multimodal=True,
-         css=CSS,
-         js=js,
-         theme='NoCrypt/miku'
-     )
-
-     # mic_transcribe = gr.Interface(
-     #     fn=transcribe_speech,
-     #     inputs=gr.Audio(sources="microphone", type="filepath", editable=False),
-     #     outputs=gr.components.Textbox(),
-     # )
-
-     # chat_demo_interface.queue()
  demo.queue().launch()
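
For anyone trying the new path outside Gradio, a minimal usage sketch assembled only from calls visible in this diff. Caveats: in this commit generate_video still returns the placeholder string "path_to_generated_video.mp4" rather than a real file; the attention_visualize, last_visualize_layers, raw_image_path, and target_aspect_ratio keyword arguments come from the model repo's custom remote code, so their exact semantics are assumed; "demo_1.jpg" is a stand-in path (load_image expects a file path, which gr.Image(type="filepath") supplies in the app itself).

# Hedged sketch: exercising the attention-visualization call outside the UI.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_ID = "khang119966/Vintern-1B-v3_5-explainableAI"
model = AutoModel.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True, trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)

# load_image is app.py's helper; with target_aspect_ratio=True it returns
# the stacked tile tensor plus the tiling grid chosen by dynamic_preprocess.
pixel_values, grid = load_image("demo_1.jpg", max_num=6, target_aspect_ratio=True)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=100, do_sample=False, num_beams=3,
                         repetition_penalty=2.5)
response, query = model.chat(
    tokenizer, pixel_values,
    '<image>\nExtract the information from the image and return it as markdown.',
    generation_config, return_history=False,
    attention_visualize=True, last_visualize_layers=7,
    raw_image_path="demo_1.jpg", target_aspect_ratio=grid)
print(response)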
 