Spaces:
Running
on
Zero
Running
on
Zero
update
Browse files
- app.py +10 -10
- eagle_vl/serve/chat_utils.py +79 -7
- requirements.txt +4 -1
app.py
CHANGED
@@ -98,7 +98,7 @@ def predict(
|
|
98 |
history,
|
99 |
top_p,
|
100 |
temperature,
|
101 |
-
|
102 |
max_context_length_tokens,
|
103 |
video_nframes,
|
104 |
chunk_size: int = 512,
|
@@ -113,7 +113,7 @@ def predict(
|
|
113 |
top_p (float): The top-p value.
|
114 |
temperature (float): The temperature value.
|
115 |
repetition_penalty (float): The repetition penalty value.
|
116 |
-
|
117 |
max_context_length_tokens (int): The max context length tokens.
|
118 |
chunk_size (int): The chunk size.
|
119 |
"""
|
@@ -171,7 +171,7 @@ def predict(
|
|
171 |
model=model,
|
172 |
processor=processor,
|
173 |
stop_words=stop_words,
|
174 |
-
max_length=
|
175 |
temperature=temperature,
|
176 |
top_p=top_p,
|
177 |
video_nframes=video_nframes,
|
@@ -196,7 +196,7 @@ def predict(
|
|
196 |
print(
|
197 |
f"temperature: {temperature}, "
|
198 |
f"top_p: {top_p}, "
|
199 |
-
f"
|
200 |
)
|
201 |
|
202 |
yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
|
@@ -209,7 +209,7 @@ def retry(
|
|
209 |
history,
|
210 |
top_p,
|
211 |
temperature,
|
212 |
-
|
213 |
max_context_length_tokens,
|
214 |
video_nframes,
|
215 |
chunk_size: int = 512,
|
@@ -234,7 +234,7 @@ def retry(
|
|
234 |
history,
|
235 |
top_p,
|
236 |
temperature,
|
237 |
-
|
238 |
max_context_length_tokens,
|
239 |
video_nframes,
|
240 |
chunk_size,
|
@@ -286,11 +286,11 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
|
|
286 |
temperature = gr.Slider(
|
287 |
minimum=0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature"
|
288 |
)
|
289 |
-
|
290 |
-
minimum=512, maximum=
|
291 |
)
|
292 |
max_context_length_tokens = gr.Slider(
|
293 |
-
minimum=512, maximum=
|
294 |
)
|
295 |
video_nframes = gr.Slider(
|
296 |
minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"
|
@@ -310,7 +310,7 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
|
|
310 |
history,
|
311 |
top_p,
|
312 |
temperature,
|
313 |
-
|
314 |
max_context_length_tokens,
|
315 |
video_nframes
|
316 |
]
|
|
|
98 |
history,
|
99 |
top_p,
|
100 |
temperature,
|
101 |
+
max_generate_length,
|
102 |
max_context_length_tokens,
|
103 |
video_nframes,
|
104 |
chunk_size: int = 512,
|
|
|
113 |
top_p (float): The top-p value.
|
114 |
temperature (float): The temperature value.
|
115 |
repetition_penalty (float): The repetition penalty value.
|
116 |
+
max_generate_length (int): The max length tokens.
|
117 |
max_context_length_tokens (int): The max context length tokens.
|
118 |
chunk_size (int): The chunk size.
|
119 |
"""
|
|
|
171 |
model=model,
|
172 |
processor=processor,
|
173 |
stop_words=stop_words,
|
174 |
+
max_length=max_generate_length,
|
175 |
temperature=temperature,
|
176 |
top_p=top_p,
|
177 |
video_nframes=video_nframes,
|
|
|
196 |
print(
|
197 |
f"temperature: {temperature}, "
|
198 |
f"top_p: {top_p}, "
|
199 |
+
f"max_generate_length: {max_generate_length}"
|
200 |
)
|
201 |
|
202 |
yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
|
|
|
209 |
history,
|
210 |
top_p,
|
211 |
temperature,
|
212 |
+
max_generate_length,
|
213 |
max_context_length_tokens,
|
214 |
video_nframes,
|
215 |
chunk_size: int = 512,
|
|
|
234 |
history,
|
235 |
top_p,
|
236 |
temperature,
|
237 |
+
max_generate_length,
|
238 |
max_context_length_tokens,
|
239 |
video_nframes,
|
240 |
chunk_size,
|
|
|
286 |
temperature = gr.Slider(
|
287 |
minimum=0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature"
|
288 |
)
|
289 |
+
max_generate_length = gr.Slider(
|
290 |
+
minimum=512, maximum=8192, value=4096, step=64, interactive=True, label="Max Generate Length"
|
291 |
)
|
292 |
max_context_length_tokens = gr.Slider(
|
293 |
+
minimum=512, maximum=65536, value=16384, step=64, interactive=True, label="Max Context Length Tokens"
|
294 |
)
|
295 |
video_nframes = gr.Slider(
|
296 |
minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"
|
|
|
310 |
history,
|
311 |
top_p,
|
312 |
temperature,
|
313 |
+
max_generate_length,
|
314 |
max_context_length_tokens,
|
315 |
video_nframes
|
316 |
]
|
eagle_vl/serve/chat_utils.py
CHANGED
@@ -17,6 +17,44 @@ import mimetypes
|
|
17 |
IMAGE_TOKEN = "<image>"
|
18 |
logger = logging.getLogger("gradio_logger")
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
class SeparatorStyle(IntEnum):
|
22 |
"""Separator styles."""
|
@@ -342,6 +380,40 @@ def convert_conversation_to_prompts(conversation: Conversation):
|
|
342 |
return conv_prompts, last_image
|
343 |
|
344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
def to_gradio_chatbot(conversation: Conversation) -> list:
|
346 |
"""Convert the conversation to gradio chatbot format, supporting images and video."""
|
347 |
ret = []
|
@@ -360,7 +432,7 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
|
|
360 |
|
361 |
for j, item in enumerate(items):
|
362 |
# If string path, determine type
|
363 |
-
if isinstance(item, str):
|
364 |
mime, _ = mimetypes.guess_type(item)
|
365 |
with open(item, "rb") as f:
|
366 |
data = f.read()
|
@@ -372,15 +444,15 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
|
|
372 |
f'alt="user upload image_{j}" '
|
373 |
f'style="max-width:300px;height:auto;" />'
|
374 |
)
|
375 |
-
elif mime and mime.startswith("video/"):
|
376 |
-
media_str += (
|
377 |
-
f'<video controls '
|
378 |
-
f'style="max-width:300px;height:auto;" '
|
379 |
-
f'src="data:{mime};base64,{b64}"></video>'
|
380 |
-
)
|
381 |
else:
|
382 |
# Fallback to link
|
383 |
media_str += f'<a href="{item}" target="_blank">{item}</a>'
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
|
385 |
# If PIL image
|
386 |
else:
|
|
|
17 |
IMAGE_TOKEN = "<image>"
|
18 |
logger = logging.getLogger("gradio_logger")
|
19 |
|
20 |
+
import cv2
|
21 |
+
import base64
|
22 |
+
import tempfile
|
23 |
+
import os
|
24 |
+
import imageio
|
25 |
+
|
26 |
+
def compress_video_to_base64(video_path: str, max_frames=128, resolution=(960, 540)) -> str:
    """Downsample and re-encode a video, returning it as a base64 string.

    Reads ``video_path`` with OpenCV, keeps at most ``max_frames`` evenly
    spaced frames resized to ``resolution``, re-encodes them as an H.264
    MP4 via imageio, and returns the resulting bytes base64-encoded.

    Args:
        video_path: Path to the source video file.
        max_frames: Upper bound on the number of frames kept.
        resolution: ``(width, height)`` each kept frame is resized to.

    Returns:
        The re-encoded MP4 data as a base64 ``str``.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Ceil division so the sampled count never exceeds max_frames.
        # Floor division (total // max_frames) can keep extra frames when
        # total_frames is not an exact multiple of max_frames.
        step = max(1, -(-total_frames // max_frames)) if total_frames > 0 else 1

        frames = []
        count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if count % step == 0 and len(frames) < max_frames:
                frame_resized = cv2.resize(frame, resolution)
                # OpenCV decodes to BGR; imageio expects RGB.
                frames.append(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
            count += 1
    finally:
        # Always release the capture handle, even if decoding fails mid-way.
        cap.release()

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # quality: 0 (worst) - 10 (best)
        writer = imageio.get_writer(tmp_path, fps=10, codec='libx264', quality=8)
        try:
            for f in frames:
                writer.append_data(f)
        finally:
            writer.close()

        with open(tmp_path, "rb") as f:
            video_data = f.read()
    finally:
        # Remove the temp file even when encoding raised.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return base64.b64encode(video_data).decode("utf-8")
|
56 |
+
|
57 |
+
|
58 |
|
59 |
class SeparatorStyle(IntEnum):
|
60 |
"""Separator styles."""
|
|
|
380 |
return conv_prompts, last_image
|
381 |
|
382 |
|
383 |
+
def to_gradio_chatbot2(conversation: Conversation) -> list:
    """Convert the conversation to gradio chatbot format."""
    rows = []
    for idx, (_, message) in enumerate(conversation.messages[conversation.offset :]):
        if idx % 2 != 0:
            # Assistant turn: fill the reply slot of the last row.
            rows[-1][-1] = message
            continue

        # User turn: message may be a (text, images) tuple.
        if type(message) is tuple:
            message, attachments = copy.deepcopy(message)
            if isinstance(attachments, list):
                rendered = []
                for j, attachment in enumerate(attachments):
                    if isinstance(attachment, str):
                        # File path: inline the raw bytes as a base64 <img> tag.
                        with open(attachment, "rb") as fh:
                            payload = fh.read()
                        encoded = base64.b64encode(payload).decode()
                        rendered.append(
                            f'<img src="data:image/png;base64,{encoded}" '
                            f'alt="user upload image" style="max-width: 300px; height: auto;" />'
                        )
                    else:
                        # PIL image: delegate resizing + encoding to the helper.
                        rendered.append(
                            pil_to_base64(attachment, f"user upload image_{j}", max_size=800, min_size=400)
                        )
                message = "".join(rendered) + message

        rows.append([message, None])

    return rows
|
414 |
+
|
415 |
+
|
416 |
+
|
417 |
def to_gradio_chatbot(conversation: Conversation) -> list:
|
418 |
"""Convert the conversation to gradio chatbot format, supporting images and video."""
|
419 |
ret = []
|
|
|
432 |
|
433 |
for j, item in enumerate(items):
|
434 |
# If string path, determine type
|
435 |
+
if isinstance(item, str) and (not item.endswith((".mp4", ".mov", ".avi", ".webm"))):
|
436 |
mime, _ = mimetypes.guess_type(item)
|
437 |
with open(item, "rb") as f:
|
438 |
data = f.read()
|
|
|
444 |
f'alt="user upload image_{j}" '
|
445 |
f'style="max-width:300px;height:auto;" />'
|
446 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
else:
|
448 |
# Fallback to link
|
449 |
media_str += f'<a href="{item}" target="_blank">{item}</a>'
|
450 |
+
elif isinstance(item, str) and (item.endswith((".mp4", ".mov", ".avi", ".webm"))):
|
451 |
+
b64 = compress_video_to_base64(item)
|
452 |
+
media_str += (
|
453 |
+
f'<video controls style="max-width:300px;height:auto;" '
|
454 |
+
f'src="data:video/mp4;base64,{b64}"></video>'
|
455 |
+
)
|
456 |
|
457 |
# If PIL image
|
458 |
else:
|
requirements.txt
CHANGED
@@ -22,4 +22,7 @@ SentencePiece
|
|
22 |
|
23 |
# eagle
|
24 |
peft
|
25 |
-
decord
|
|
|
|
|
|
|
|
22 |
|
23 |
# eagle
|
24 |
peft
|
25 |
+
decord
|
26 |
+
opencv-python
|
27 |
+
imageio
|
28 |
+
imageio-ffmpeg
|