liuguilin committed on
Commit 107b53d · 1 Parent(s): 16549a8
Files changed (3)
  1. app.py +10 -10
  2. eagle_vl/serve/chat_utils.py +79 -7
  3. requirements.txt +4 -1
app.py CHANGED
@@ -98,7 +98,7 @@ def predict(
     history,
     top_p,
     temperature,
-    max_length_tokens,
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,
@@ -113,7 +113,7 @@ def predict(
         top_p (float): The top-p value.
         temperature (float): The temperature value.
         repetition_penalty (float): The repetition penalty value.
-        max_length_tokens (int): The max length tokens.
+        max_generate_length (int): The max length tokens.
         max_context_length_tokens (int): The max context length tokens.
         chunk_size (int): The chunk size.
     """
@@ -171,7 +171,7 @@ def predict(
         model=model,
         processor=processor,
         stop_words=stop_words,
-        max_length=max_length_tokens,
+        max_length=max_generate_length,
         temperature=temperature,
         top_p=top_p,
         video_nframes=video_nframes,
@@ -196,7 +196,7 @@ def predict(
     print(
         f"temperature: {temperature}, "
         f"top_p: {top_p}, "
-        f"max_length_tokens: {max_length_tokens}"
+        f"max_generate_length: {max_generate_length}"
     )

     yield gradio_chatbot_output, to_gradio_history(conversation), "Generate: Success"
@@ -209,7 +209,7 @@ def retry(
     history,
     top_p,
     temperature,
-    max_length_tokens,
+    max_generate_length,
     max_context_length_tokens,
     video_nframes,
     chunk_size: int = 512,
@@ -234,7 +234,7 @@ def retry(
         history,
         top_p,
         temperature,
-        max_length_tokens,
+        max_generate_length,
         max_context_length_tokens,
         video_nframes,
         chunk_size,
@@ -286,11 +286,11 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
                 temperature = gr.Slider(
                     minimum=0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature"
                 )
-                max_length_tokens = gr.Slider(
-                    minimum=512, maximum=16384, value=4096, step=64, interactive=True, label="Max Length Tokens"
+                max_generate_length = gr.Slider(
+                    minimum=512, maximum=8192, value=4096, step=64, interactive=True, label="Max Generate Length"
                 )
                 max_context_length_tokens = gr.Slider(
-                    minimum=512, maximum=16384, value=4096, step=64, interactive=True, label="Max Context Length Tokens"
+                    minimum=512, maximum=65536, value=16384, step=64, interactive=True, label="Max Context Length Tokens"
                 )
                 video_nframes = gr.Slider(
                     minimum=1, maximum=128, value=16, step=1, interactive=True, label="Video Nframes"
@@ -310,7 +310,7 @@ def build_demo(args: argparse.Namespace) -> gr.Blocks:
         history,
         top_p,
         temperature,
-        max_length_tokens,
+        max_generate_length,
         max_context_length_tokens,
         video_nframes
     ]
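
The app.py change splits the old "Max Length Tokens" control into a generation budget (max_generate_length, up to 8192) and a larger context cap (max_context_length_tokens, up to 65536), and threads the renamed value through predict(), retry(), and the event inputs. A minimal sketch of how such a slider pair typically reaches the handler via Gradio's event wiring (not part of this commit; the handler body below is illustrative only):

import gradio as gr

def predict(text, top_p, temperature, max_generate_length, max_context_length_tokens):
    # Illustrative stand-in: the real predict() streams model output; here we just echo the limits.
    return f"would generate up to {max_generate_length} tokens with a {max_context_length_tokens}-token context"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    top_p = gr.Slider(minimum=0, maximum=1.0, value=0.95, step=0.05, label="Top p")
    temperature = gr.Slider(minimum=0, maximum=1.0, value=0.8, step=0.1, label="Temperature")
    max_generate_length = gr.Slider(minimum=512, maximum=8192, value=4096, step=64, label="Max Generate Length")
    max_context_length_tokens = gr.Slider(minimum=512, maximum=65536, value=16384, step=64, label="Max Context Length Tokens")
    output = gr.Textbox(label="Output")
    # The sliders are passed positionally to the handler, mirroring the inputs list in the diff.
    prompt.submit(
        predict,
        [prompt, top_p, temperature, max_generate_length, max_context_length_tokens],
        output,
    )

demo.launch()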
eagle_vl/serve/chat_utils.py CHANGED
@@ -17,6 +17,44 @@ import mimetypes
 IMAGE_TOKEN = "<image>"
 logger = logging.getLogger("gradio_logger")
 
+import cv2
+import base64
+import tempfile
+import os
+import imageio
+
+def compress_video_to_base64(video_path: str, max_frames=128, resolution=(960, 540)) -> str:
+    cap = cv2.VideoCapture(video_path)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    step = max(1, total_frames // max_frames)
+
+    frames = []
+    count = 0
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if count % step == 0:
+            frame_resized = cv2.resize(frame, resolution)
+            frames.append(cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB))
+        count += 1
+    cap.release()
+
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+        tmp_path = tmp.name
+
+    writer = imageio.get_writer(tmp_path, fps=10, codec='libx264', quality=8)  # quality: 0 (worst) - 10 (best)
+    for f in frames:
+        writer.append_data(f)
+    writer.close()
+
+    with open(tmp_path, "rb") as f:
+        video_data = f.read()
+    os.remove(tmp_path)
+
+    return base64.b64encode(video_data).decode("utf-8")
+
+
 
 class SeparatorStyle(IntEnum):
     """Separator styles."""
@@ -342,6 +380,40 @@ def convert_conversation_to_prompts(conversation: Conversation):
     return conv_prompts, last_image
 
 
+def to_gradio_chatbot2(conversation: Conversation) -> list:
+    """Convert the conversation to gradio chatbot format."""
+    ret = []
+    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
+        if i % 2 == 0:
+            if type(msg) is tuple:
+                msg, images = copy.deepcopy(msg)
+
+                if isinstance(images, list):
+                    img_str = ""
+                    for j, image in enumerate(images):
+                        if isinstance(image, str):
+                            with open(image, "rb") as f:
+                                data = f.read()
+                            img_b64_str = base64.b64encode(data).decode()
+                            image_str = (
+                                f'<img src="data:image/png;base64,{img_b64_str}" '
+                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
+                            )
+                        else:
+                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)
+
+                        img_str += image_str
+                    msg = img_str + msg
+                else:
+                    pass
+
+            ret.append([msg, None])
+        else:
+            ret[-1][-1] = msg
+    return ret
+
+
+
 def to_gradio_chatbot(conversation: Conversation) -> list:
     """Convert the conversation to gradio chatbot format, supporting images and video."""
     ret = []
@@ -360,7 +432,7 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
 
         for j, item in enumerate(items):
             # If string path, determine type
-            if isinstance(item, str):
+            if isinstance(item, str) and (not item.endswith((".mp4", ".mov", ".avi", ".webm"))):
                 mime, _ = mimetypes.guess_type(item)
                 with open(item, "rb") as f:
                     data = f.read()
@@ -372,15 +444,15 @@ def to_gradio_chatbot(conversation: Conversation) -> list:
                         f'alt="user upload image_{j}" '
                         f'style="max-width:300px;height:auto;" />'
                     )
-                elif mime and mime.startswith("video/"):
-                    media_str += (
-                        f'<video controls '
-                        f'style="max-width:300px;height:auto;" '
-                        f'src="data:{mime};base64,{b64}"></video>'
-                    )
                 else:
                     # Fallback to link
                     media_str += f'<a href="{item}" target="_blank">{item}</a>'
+            elif isinstance(item, str) and (item.endswith((".mp4", ".mov", ".avi", ".webm"))):
+                b64 = compress_video_to_base64(item)
+                media_str += (
+                    f'<video controls style="max-width:300px;height:auto;" '
+                    f'src="data:video/mp4;base64,{b64}"></video>'
+                )
 
             # If PIL image
             else:
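
chat_utils.py gains compress_video_to_base64(), which subsamples a clip to at most max_frames frames at the given resolution, re-encodes it as H.264 through imageio, and returns the result base64-encoded; to_gradio_chatbot() now routes paths ending in .mp4/.mov/.avi/.webm through it and inlines the output as a data-URI <video> tag instead of relying on the mimetype check. A small usage sketch (not part of this commit; the file path is hypothetical):

from eagle_vl.serve.chat_utils import compress_video_to_base64

# Hypothetical local clip; any readable video path works.
b64 = compress_video_to_base64("demo_clip.mp4", max_frames=64, resolution=(640, 360))

# Same embedding pattern the updated to_gradio_chatbot() uses.
video_html = (
    f'<video controls style="max-width:300px;height:auto;" '
    f'src="data:video/mp4;base64,{b64}"></video>'
)
print(f"embedded {len(b64)} base64 characters of compressed video")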
requirements.txt CHANGED
@@ -22,4 +22,7 @@ SentencePiece
 
 # eagle
 peft
-decord
+decord
+opencv-python
+imageio
+imageio-ffmpeg
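
requirements.txt adds the dependencies the new video path needs: opencv-python for frame extraction, imageio plus imageio-ffmpeg for the H.264 re-encode, and keeps decord. A quick post-install sanity check (editor's sketch, not part of the commit):

import importlib

# Distribution names in requirements.txt map to these import names.
for module in ("cv2", "imageio", "imageio_ffmpeg", "decord"):
    importlib.import_module(module)
print("video dependencies import cleanly")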