chongzhou committed
Commit dd6a79c · Parent(s): 1f52c1d

move inference_state to gr.state

Files changed (1): app.py (+249 -180)
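For readers skimming the diff: the commit replaces the module-level `global_inference_states` dict (keyed by `request.session_hash`) and the shared module-level `predictor` with per-session `gr.State` values that are passed through every event handler's `inputs` and returned through its `outputs`. Below is a minimal, self-contained sketch of that pattern under simplified assumptions — it is not taken from app.py, and the handler and component names are made up for illustration:

# Sketch of the gr.State pattern applied in this commit (hypothetical example).
import gradio as gr


def load_model(state):
    # Lazily create the per-session object the first time it is needed.
    if state is None:
        state = {"predictor": "expensive-object", "clicks": 0}
    return state, "model ready"


def add_click(state):
    # Mutate and return the per-session state so Gradio stores the new value.
    if state is None:
        return state, "click 'init' first"
    state["clicks"] += 1
    return state, f"clicks so far: {state['clicks']}"


with gr.Blocks() as demo:
    session_state = gr.State(None)  # one copy per browser session
    status = gr.Textbox(label="status")
    init_btn = gr.Button("init")
    click_btn = gr.Button("click")

    init_btn.click(load_model, inputs=[session_state], outputs=[session_state, status])
    click_btn.click(add_click, inputs=[session_state], outputs=[session_state, status])

if __name__ == "__main__":
    demo.launch()

Because gr.State holds a separate value per browser session, concurrent users no longer share or overwrite each other's predictor and inference state, and no manual bookkeeping keyed by session hash is needed.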
app.py CHANGED
@@ -17,7 +17,6 @@ import cv2
 import matplotlib.pyplot as plt
 import numpy as np
 
-import spaces
 import torch
 
 from moviepy.editor import ImageSequenceClip
@@ -70,11 +69,8 @@ examples = [
 ]
 
 OBJ_ID = 0
-
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
-predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-global_inference_states = {}
 
 
 def get_video_fps(video_path):
@@ -92,75 +88,82 @@ def get_video_fps(video_path):
 
 
 def reset(
-    session_first_frame,
-    session_all_frames,
-    session_input_points,
-    session_input_labels,
-    request: gr.Request,
+    first_frame,
+    all_frames,
+    input_points,
+    input_labels,
+    inference_state,
+    predictor,
 ):
-    session_id = request.session_hash
-    predictor.to("cpu")
-    session_input_points = []
-    session_input_labels = []
-
-    if global_inference_states[session_id] is not None:
-        predictor.reset_state(global_inference_states[session_id])
-    session_first_frame = None
-    session_all_frames = None
-    global_inference_states[session_id] = None
+    first_frame = None
+    all_frames = None
+    input_points = []
+    input_labels = []
+
+    if inference_state and predictor:
+        predictor.reset_state(inference_state)
+    inference_state = None
     return (
         None,
         gr.update(open=True),
         None,
         None,
         gr.update(value=None, visible=False),
-        session_first_frame,
-        session_all_frames,
-        session_input_points,
-        session_input_labels,
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+        predictor,
     )
 
 
 def clear_points(
-    session_input_points,
-    session_input_labels,
-    request: gr.Request,
+    first_frame,
+    all_frames,
+    input_points,
+    input_labels,
+    inference_state,
+    predictor,
 ):
-    session_id = request.session_hash
-    predictor.to("cpu")
-    session_input_points = []
-    session_input_labels = []
-    if global_inference_states[session_id]["tracking_has_started"]:
-        predictor.reset_state(global_inference_states[session_id])
+    input_points = []
+    input_labels = []
+    if inference_state and predictor and inference_state["tracking_has_started"]:
+        predictor.reset_state(inference_state)
     return (
-        session_first_frame,
+        first_frame,
         None,
         gr.update(value=None, visible=False),
-        session_input_points,
-        session_input_labels,
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+        predictor,
     )
 
 
 def preprocess_video_in(
     video_path,
-    session_first_frame,
-    session_all_frames,
-    session_input_points,
-    session_input_labels,
-    request: gr.Request,
+    first_frame,
+    all_frames,
+    input_points,
+    input_labels,
+    inference_state,
+    predictor,
 ):
-    session_id = request.session_hash
-    predictor.to("cpu")
     if video_path is None:
         return (
             gr.update(open=True),  # video_in_drawer
             None,  # points_map
             None,  # output_image
             gr.update(value=None, visible=False),  # output_video
-            session_first_frame,
-            session_all_frames,
-            session_input_points,
-            session_input_labels,
+            first_frame,
+            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+            predictor,
         )
 
     # Read the first frame
@@ -172,14 +175,19 @@ def preprocess_video_in(
             None,  # points_map
             None,  # output_image
             gr.update(value=None, visible=False),  # output_video
-            session_first_frame,
-            session_all_frames,
-            session_input_points,
-            session_input_labels,
+            first_frame,
+            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+            predictor,
         )
 
+    if predictor is None:
+        predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
+
     frame_number = 0
-    first_frame = None
+    _first_frame = None
     all_frames = []
 
     while True:
@@ -192,100 +200,107 @@ def preprocess_video_in(
 
         # Store the first frame
         if frame_number == 0:
-            first_frame = frame
+            _first_frame = frame
        all_frames.append(frame)
 
         frame_number += 1
 
     cap.release()
-    session_first_frame = copy.deepcopy(first_frame)
-    session_all_frames = all_frames
-
-    global_inference_states[session_id] = predictor.init_state(video_path=video_path)
-
-    session_input_points = []
-    session_input_labels = []
+    first_frame = copy.deepcopy(_first_frame)
+    inference_state = predictor.init_state(video_path=video_path)
+    input_points = []
+    input_labels = []
 
     return [
         gr.update(open=False),  # video_in_drawer
         first_frame,  # points_map
         None,  # output_image
         gr.update(value=None, visible=False),  # output_video
-        session_first_frame,
-        session_all_frames,
-        session_input_points,
-        session_input_labels,
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+        predictor,
     ]
 
 
-@spaces.GPU
 def segment_with_points(
     point_type,
-    session_input_points,
-    session_input_labels,
+    first_frame,
+    all_frames,
+    input_points,
+    input_labels,
+    inference_state,
+    predictor,
     evt: gr.SelectData,
-    request: gr.Request,
 ):
-    session_id = request.session_hash
-    if torch.cuda.get_device_properties(0).major >= 8:
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+    if torch.cuda.is_available():
         predictor.to("cuda")
-        session_input_points.append(evt.index)
-        print(f"TRACKING INPUT POINT: {session_input_points}")
-
-        if point_type == "include":
-            session_input_labels.append(1)
-        elif point_type == "exclude":
-            session_input_labels.append(0)
-        print(f"TRACKING INPUT LABEL: {session_input_labels}")
-
-        # Open the image and get its dimensions
-        transparent_background = Image.fromarray(session_first_frame).convert("RGBA")
-        w, h = transparent_background.size
-
-        # Define the circle radius as a fraction of the smaller dimension
-        fraction = 0.01  # You can adjust this value as needed
-        radius = int(fraction * min(w, h))
-
-        # Create a transparent layer to draw on
-        transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
-
-        for index, track in enumerate(session_input_points):
-            if session_input_labels[index] == 1:
-                cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
-            else:
-                cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
-
-        # Convert the transparent layer back to an image
-        transparent_layer = Image.fromarray(transparent_layer, "RGBA")
-        selected_point_map = Image.alpha_composite(
-            transparent_background, transparent_layer
-        )
+        inference_state["device"] = "cuda"
+        if torch.cuda.get_device_properties(0).major >= 8:
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+        torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+
+    input_points.append(evt.index)
+    print(f"TRACKING INPUT POINT: {input_points}")
+
+    if point_type == "include":
+        input_labels.append(1)
+    elif point_type == "exclude":
+        input_labels.append(0)
+    print(f"TRACKING INPUT LABEL: {input_labels}")
+
+    # Open the image and get its dimensions
+    transparent_background = Image.fromarray(first_frame).convert("RGBA")
+    w, h = transparent_background.size
+
+    # Define the circle radius as a fraction of the smaller dimension
+    fraction = 0.01  # You can adjust this value as needed
+    radius = int(fraction * min(w, h))
+
+    # Create a transparent layer to draw on
+    transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
+
+    for index, track in enumerate(input_points):
+        if input_labels[index] == 1:
+            cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
+        else:
+            cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
+
+    # Convert the transparent layer back to an image
+    transparent_layer = Image.fromarray(transparent_layer, "RGBA")
+    selected_point_map = Image.alpha_composite(
+        transparent_background, transparent_layer
+    )
 
-        # Let's add a positive click at (x, y) = (210, 350) to get started
-        points = np.array(session_input_points, dtype=np.float32)
-        # for labels, `1` means positive click and `0` means negative click
-        labels = np.array(session_input_labels, dtype=np.int32)
-        _, _, out_mask_logits = predictor.add_new_points(
-            inference_state=global_inference_states[session_id],
-            frame_idx=0,
-            obj_id=OBJ_ID,
-            points=points,
-            labels=labels,
-        )
+    # Let's add a positive click at (x, y) = (210, 350) to get started
+    points = np.array(input_points, dtype=np.float32)
+    # for labels, `1` means positive click and `0` means negative click
+    labels = np.array(input_labels, dtype=np.int32)
+    _, _, out_mask_logits = predictor.add_new_points(
+        inference_state=inference_state,
+        frame_idx=0,
+        obj_id=OBJ_ID,
+        points=points,
+        labels=labels,
+    )
 
-        mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
-        first_frame_output = Image.alpha_composite(transparent_background, mask_image)
+    mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
+    first_frame_output = Image.alpha_composite(transparent_background, mask_image)
 
-        torch.cuda.empty_cache()
-        return (
-            selected_point_map,
-            first_frame_output,
-            session_input_points,
-            session_input_labels,
-        )
+    torch.cuda.empty_cache()
+    return (
+        selected_point_map,
+        first_frame_output,
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+        predictor,
+    )
 
 
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
@@ -303,69 +318,82 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     return mask
 
 
-@spaces.GPU
 def propagate_to_all(
     video_in,
-    session_all_frames,
-    request: gr.Request,
+    first_frame,
+    all_frames,
+    input_points,
+    input_labels,
+    inference_state,
+    predictor,
 ):
-    session_id = request.session_hash
-    predictor.to("cuda")
-    if torch.cuda.get_device_properties(0).major >= 8:
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
-        if (
-            len(session_input_points) == 0
-            or video_in is None
-            or global_inference_states[session_id] is None
-        ):
-            return None
-
-        # run propagation throughout the video and collect the results in a dict
-        video_segments = (
-            {}
-        )  # video_segments contains the per-frame segmentation results
-        print("starting propagate_in_video")
-        for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
-            global_inference_states[session_id]
-        ):
-            video_segments[out_frame_idx] = {
-                out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-                for i, out_obj_id in enumerate(out_obj_ids)
-            }
-
-        # obtain the segmentation results every few frames
-        vis_frame_stride = 1
-
-        output_frames = []
-        for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
-            transparent_background = Image.fromarray(
-                session_all_frames[out_frame_idx]
-            ).convert("RGBA")
-            out_mask = video_segments[out_frame_idx][OBJ_ID]
-            mask_image = show_mask(out_mask)
-            output_frame = Image.alpha_composite(transparent_background, mask_image)
-            output_frame = np.array(output_frame)
-            output_frames.append(output_frame)
-
-        torch.cuda.empty_cache()
-
-        # Create a video clip from the image sequence
-        original_fps = get_video_fps(video_in)
-        fps = original_fps  # Frames per second
-        clip = ImageSequenceClip(output_frames, fps=fps)
-        # Write the result to a file
-        unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
-        final_vid_output_path = f"output_video_{unique_id}.mp4"
-        final_vid_output_path = os.path.join(
-            tempfile.gettempdir(), final_vid_output_path
+    if torch.cuda.is_available():
+        predictor.to("cuda")
+        inference_state["device"] = "cuda"
+        if torch.cuda.get_device_properties(0).major >= 8:
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+        torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+
+    if len(input_points) == 0 or video_in is None or inference_state is None:
+        return None
+    # run propagation throughout the video and collect the results in a dict
+    video_segments = {}  # video_segments contains the per-frame segmentation results
+    print("starting propagate_in_video")
+    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+        inference_state
+    ):
+        video_segments[out_frame_idx] = {
+            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+            for i, out_obj_id in enumerate(out_obj_ids)
+        }
+
+    # obtain the segmentation results every few frames
+    vis_frame_stride = 1
+
+    output_frames = []
+    for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
+        transparent_background = Image.fromarray(all_frames[out_frame_idx]).convert(
+            "RGBA"
         )
+        out_mask = video_segments[out_frame_idx][OBJ_ID]
+        mask_image = show_mask(out_mask)
+        output_frame = Image.alpha_composite(transparent_background, mask_image)
+        output_frame = np.array(output_frame)
+        output_frames.append(output_frame)
+
+    torch.cuda.empty_cache()
+
+    # Create a video clip from the image sequence
+    original_fps = get_video_fps(video_in)
+    fps = original_fps  # Frames per second
+    clip = ImageSequenceClip(output_frames, fps=fps)
+    # Write the result to a file
+    unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
+    final_vid_output_path = f"output_video_{unique_id}.mp4"
+    final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_output_path)
+
+    # Write the result to a file
+    clip.write_videofile(final_vid_output_path, codec="libx264")
+
+    return (
+        gr.update(value=final_vid_output_path),
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+        predictor,
+    )
 
-        # Write the result to a file
-        clip.write_videofile(final_vid_output_path, codec="libx264")
 
-        return gr.update(value=final_vid_output_path)
+try:
+    from spaces import GPU
+
+    segment_with_points = GPU(segment_with_points)
+    propagate_to_all = GPU(propagate_to_all)
+except:
+    print("spaces unavailable")
 
 
 def update_ui():
@@ -377,6 +405,8 @@ with gr.Blocks() as demo:
     all_frames = gr.State(None)
     input_points = gr.State([])
    input_labels = gr.State([])
+    inference_state = gr.State(None)
+    predictor = gr.State(None)
 
     with gr.Column():
         # Title
@@ -430,6 +460,8 @@ with gr.Blocks() as demo:
             all_frames,
             input_points,
             input_labels,
+            inference_state,
+            predictor,
         ],
         outputs=[
             video_in_drawer,  # Accordion to hide uploaded video player
@@ -440,6 +472,8 @@ with gr.Blocks() as demo:
             all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
         ],
         queue=False,
     )
@@ -452,6 +486,8 @@ with gr.Blocks() as demo:
            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
         ],
         outputs=[
            video_in_drawer,  # Accordion to hide uploaded video player
@@ -462,6 +498,8 @@ with gr.Blocks() as demo:
            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
         ],
         queue=False,
     )
@@ -471,14 +509,22 @@ with gr.Blocks() as demo:
        fn=segment_with_points,
        inputs=[
            point_type,  # "include" or "exclude"
+            first_frame,
+            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        outputs=[
            points_map,  # updated image with points
            output_image,
+            first_frame,
+            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        queue=False,
    )
@@ -487,15 +533,23 @@ with gr.Blocks() as demo:
    clear_points_btn.click(
        fn=clear_points,
        inputs=[
+            first_frame,
+            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        outputs=[
            points_map,
            output_image,
            output_video,
+            first_frame,
+            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        queue=False,
    )
@@ -507,6 +561,8 @@ with gr.Blocks() as demo:
            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        outputs=[
            video_in,
@@ -518,6 +574,8 @@ with gr.Blocks() as demo:
            all_frames,
            input_points,
            input_labels,
+            inference_state,
+            predictor,
        ],
        queue=False,
    )
@@ -531,10 +589,21 @@ with gr.Blocks() as demo:
        fn=propagate_to_all,
        inputs=[
            video_in,
+            first_frame,
            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+            predictor,
        ],
        outputs=[
            output_video,
+            first_frame,
+            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+            predictor,
        ],
        concurrency_limit=10,
        queue=False,
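One detail worth noting near the end of the new file: with the module-level `import spaces` gone, the `spaces.GPU` decorator is now applied conditionally, so the same app.py can run both on ZeroGPU Spaces and locally without the `spaces` package. A rough sketch of that guard pattern follows; the handler name is a placeholder, and `except ImportError` is used here where the committed code uses a bare `except`:

# Sketch of optionally wrapping a GPU-bound handler with spaces.GPU (hypothetical names).
def heavy_handler(x):
    # placeholder for a function that needs GPU time on Spaces
    return x


try:
    from spaces import GPU  # only available on Hugging Face Spaces ZeroGPU hardware

    heavy_handler = GPU(heavy_handler)
except ImportError:
    # local run or non-ZeroGPU hardware: keep the undecorated function
    print("spaces unavailable; running without the GPU decorator")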