liuguilin committed on
Commit 107b53d · 1 Parent(s): 16549a8
Files changed (3)
  1. app.py +10 -10
  2. eagle_vl/serve/chat_utils.py +79 -7
  3. requirements.txt +4 -1
app.py CHANGED
@@ -98,7 +98,7 @@ def predict(
     history,
     top_p,
     temperature,
-    max_length_tokens,
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,
@@ -113,7 +113,7 @@ def predict(
         top_p (float): The top-p value.
         temperature (float): The temperature value.
         repetition_penalty (float): The repetition penalty value.
-        max_length_tokens (int): The max length tokens.
+        max_generate_length (int): The max length tokens.
         max_context_length_tokens (int): The max context length tokens.
         chunk_size (int): The chunk size.
     """
@@ -171,7 +171,7 @@ def predict(
         model=model,
         processor=processor,
         stop_words=stop_words,
-        max_length=max_length_tokens,
+        max_length=max_generate_length,
         temperature=temperature,
         top_p=top_p,
         video_nframes=video_nframes,
@@ -196,7 +196,7 @@ def predict(
     print(
         f"temperature: {temperature}, "
         f"top_p: {top_p}, "
-        f"max_length_tokens: {max_length_tokens}"
+        f"max_generate_length: {max_generate_length}"
     )

     yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
@@ -209,7 +209,7 @@ def retry(
     history,
     top_p,
     temperature,
-    max_length_tokens,
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,
@@ -234,7 +234,7 @@ def retry(
         history,
         top_p,
         temperature,
-        max_length_tokens,
+        max_generate_length,
         max_context_length_tokens,
         video_nframes,
         chunk_size,
@@ -286,11 +286,11 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
                 temperature = gr.Slider(
                     minimum=0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature"
                 )
-                max_length_tokens = gr.Slider(
-                    minimum=512, maximum=16384, value=4096, step=64, interactive=True, label="Max Length Tokens"
+                max_generate_length = gr.Slider(
+                    minimum=512, maximum=8192, value=4096, step=64, interactive=True, label="Max Generate Length"
                 )
                 max_context_length_tokens = gr.Slider(
-                    minimum=512, maximum=16384, value=4096, step=64, interactive=True, label="Max Context Length Tokens"
+                    minimum=512, maximum=65536, value=16384, step=64, interactive=True, label="Max Context Length Tokens"
                 )
                 video_nframes = gr.Slider(
                     minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"
@@ -310,7 +310,7 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
         history,
         top_p,
         temperature,
-        max_length_tokens,
+        max_generate_length,
         max_context_length_tokens,
         video_nframes
     ]
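
The app.py change splits the old "Max Length Tokens" control into a generation budget (max_generate_length, up to 8192) and a larger context cap (max_context_length_tokens, up to 65536), and threads the renamed value through predict(), retry(), and the event inputs. A minimal sketch of how such a slider pair typically reaches the handler via Gradio's event wiring (not part of this commit; the handler body below is illustrative only):

import gradio as gr

def predict(text, top_p, temperature, max_generate_length, max_context_length_tokens):
    # Illustrative stand-in: the real predict() streams model output; here we just echo the limits.
    return f"would generate up to {max_generate_length} tokens with a {max_context_length_tokens}-token context"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    top_p = gr.Slider(minimum=0, maximum=1.0, value=0.95, step=0.05, label="Top p")
    temperature = gr.Slider(minimum=0, maximum=1.0, value=0.8, step=0.1, label="Temperature")
    max_generate_length = gr.Slider(minimum=512, maximum=8192, value=4096, step=64, label="Max Generate Length")
    max_context_length_tokens = gr.Slider(minimum=512, maximum=65536, value=16384, step=64, label="Max Context Length Tokens")
    output = gr.Textbox(label="Output")
    # The sliders are passed positionally to the handler, mirroring the inputs list in the diff.
    prompt.submit(
        predict,
        [prompt, top_p, temperature, max_generate_length, max_context_length_tokens],
        output,
    )

demo.launch()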
eagle_vl/serve/chat_utils.py CHANGED
@@ -17,6 +17,44 @@ import mimetypes
 IMAGE_TOKEN = "<image>"
 logger = logging.getLogger("gradio_logger")
 
+import cv2
+import base64
+import tempfile
+import os
+import imageio
+
+def compress_video_to_base64(video_path: str, max_frames=128, resolution=(960, 540)) -> str:
+    cap = cv2.VideoCapture(video_path)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    step = max(1, total_frames // max_frames)
+
+    frames = []
+    count = 0
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if count % step == 0:
+            frame_resized = cv2.resize(frame, resolution)
+            frames.append(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
+        count += 1
+    cap.release()
+
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+        tmp_path = tmp.name
+
+    writer = imageio.get_writer(tmp_path, fps=10, codec='libx264', quality=8)  # quality: 0 (worst) - 10 (best)
+    for f in frames:
+        writer.append_data(f)
+    writer.close()
+
+    with open(tmp_path, "rb") as f:
+        video_data = f.read()
+    os.remove(tmp_path)
+
+    return base64.b64encode(video_data).decode("utf-8")
+
+
 
 class SeparatorStyle(IntEnum):
     """Separator styles."""
@@ -342,6 +380,40 @@ def convert_conversation_to_prompts(conversation: Conversation):
     return conv_prompts, last_image
 
 
+def to_gradio_chatbot2(conversation: Conversation) -> list:
+    """Convert the conversation to gradio chatbot format."""
+    ret = []
+    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
+        if i % 2 == 0:
+            if type(msg) is tuple:
+                msg, images = copy.deepcopy(msg)
+
+                if isinstance(images, list):
+                    img_str = ""
+                    for j, image in enumerate(images):
+                        if isinstance(image, str):
+                            with open(image, "rb") as f:
+                                data = f.read()
+                            img_b64_str = base64.b64encode(data).decode()
+                            image_str = (
+                                f'<img src="data:image/png;base64,{img_b64_str}" '
+                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
+                            )
+                        else:
+                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
+
+                        img_str += image_str
+                    msg = img_str + msg
+                else:
+                    pass
+
+            ret.append([msg, None])
+        else:
+            ret[-1][-1] = msg
+    return ret
+
+
+
 def to_gradio_chatbot(conversation: Conversation) -> list:
     """Convert the conversation to gradio chatbot format, supporting images and video."""
     ret = []
@@ -360,7 +432,7 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
 
         for j, item in enumerate(items):
             # If string path, determine type
-            if isinstance(item, str):
+            if isinstance(item, str) and (not item.endswith((".mp4", ".mov", ".avi", ".webm"))):
                 mime, _ = mimetypes.guess_type(item)
                 with open(item, "rb") as f:
                     data = f.read()
@@ -372,15 +444,15 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
                         f'alt="user upload image_{j}" '
                         f'style="max-width:300px;height:auto;" />'
                     )
-                elif mime and mime.startswith("video/"):
-                    media_str += (
-                        f'<video controls '
-                        f'style="max-width:300px;height:auto;" '
-                        f'src="data:{mime};base64,{b64}"></video>'
-                    )
                 else:
                     # Fallback to link
                     media_str += f'<a href="{item}" target="_blank">{item}</a>'
+            elif isinstance(item, str) and (item.endswith((".mp4", ".mov", ".avi", ".webm"))):
+                b64 = compress_video_to_base64(item)
+                media_str += (
+                    f'<video controls style="max-width:300px;height:auto;" '
+                    f'src="data:video/mp4;base64,{b64}"></video>'
+                )
 
             # If PIL image
             else:
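
chat_utils.py gains compress_video_to_base64(), which subsamples a clip to at most max_frames frames at the given resolution, re-encodes it as H.264 through imageio, and returns the result base64-encoded; to_gradio_chatbot() now routes paths ending in .mp4/.mov/.avi/.webm through it and inlines the output as a data-URI <video> tag instead of relying on the mimetype check. A small usage sketch (not part of this commit; the file path is hypothetical):

from eagle_vl.serve.chat_utils import compress_video_to_base64

# Hypothetical local clip; any readable video path works.
b64 = compress_video_to_base64("demo_clip.mp4", max_frames=64, resolution=(640, 360))

# Same embedding pattern the updated to_gradio_chatbot() uses.
video_html = (
    f'<video controls style="max-width:300px;height:auto;" '
    f'src="data:video/mp4;base64,{b64}"></video>'
)
print(f"embedded {len(b64)} base64 characters of compressed video")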
requirements.txt CHANGED
@@ -22,4 +22,7 @@ SentencePiece
 
 # eagle
 peft
-decord
+decord
+opencv-python
+imageio
+imageio-ffmpeg
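
requirements.txt adds the dependencies the new video path needs: opencv-python for frame extraction, imageio plus imageio-ffmpeg for the H.264 re-encode, and keeps decord. A quick post-install sanity check (editor's sketch, not part of the commit):

import importlib

# Distribution names in requirements.txt map to these import names.
for module in ("cv2", "imageio", "imageio_ffmpeg", "decord"):
    importlib.import_module(module)
print("video dependencies import cleanly")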