Update app.py
app.py
CHANGED
@@ -46,11 +46,30 @@ examples = [
 
 OBJ_ID = 0
 
-# Initialize model on CPU
+# Initialize model on CPU - add error handling for file paths
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
-predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-print("predictor loaded")
+
+# Check if model files exist
+def check_file_exists(filepath):
+    import os
+    exists = os.path.exists(filepath)
+    if not exists:
+        print(f"WARNING: File not found: {filepath}")
+    return exists
+
+# Verify files exist
+model_files_exist = check_file_exists(sam2_checkpoint) and check_file_exists(model_cfg)
+try:
+    # Load model with more careful error handling
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
+    print("predictor loaded on CPU")
+except Exception as e:
+    print(f"Error loading model: {e}")
+    import traceback
+    traceback.print_exc()
+    # Still create a predictor variable to avoid NameError
+    predictor = None
 
 # Function to get video frame rate
 def get_video_fps(video_path):
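
Note on the fallback: because the except branch assigns predictor = None instead of re-raising, every handler that uses the model needs a guard before touching it. A minimal sketch of such a guard (the helper name require_predictor is hypothetical, not part of this commit):

def require_predictor():
    # Fail fast with a readable message instead of an AttributeError
    # surfacing later from inside a None predictor.
    if predictor is None:
        raise RuntimeError(
            "Model not loaded; check that checkpoints/edgetam.pt "
            "and edgetam.yaml exist."
        )
    return predictor

Calling require_predictor() at the top of segment_with_points and propagate_to_all would turn the silent None into an explicit, user-visible error.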
@@ -196,10 +215,9 @@ def segment_with_points(
     print(f"TRACKING INPUT LABEL: {session_state['input_labels']}")
 
     # Open the image and get its dimensions
-    transparent_background = Image.fromarray(
-        session_state["first_frame"]
-    ).convert("RGBA")
-    w, h = transparent_background.size
+    first_frame = session_state["first_frame"]
+    h, w = first_frame.shape[:2]
+    transparent_background = Image.fromarray(first_frame).convert("RGBA")
 
     # Define the circle radius as a fraction of the smaller dimension
     fraction = 0.01 # You can adjust this value as needed
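
A detail worth keeping in mind in this change: a NumPy frame reports shape as (height, width), while the PIL image built from it reports size as (width, height). A self-contained sketch of the relationship:

import numpy as np
from PIL import Image

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # 480 rows (h), 640 columns (w)
h, w = frame.shape[:2]

image = Image.fromarray(frame).convert("RGBA")
assert image.size == (w, h)  # PIL reverses the order: (width, height)

This is why the old code read w, h from transparent_background.size while the new code reads h, w from first_frame.shape[:2]; both describe the same frame.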
@@ -225,17 +243,38 @@ def segment_with_points(
     # for labels, `1` means positive click and `0` means negative click
     labels = np.array(session_state["input_labels"], np.int32)
 
-    _, _, out_mask_logits = predictor.add_new_points(
-        inference_state=session_state["inference_state"],
-        frame_idx=0,
-        obj_id=OBJ_ID,
-        points=points,
-        labels=labels,
-    )
-
-    # Create the mask
-    mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
-    first_frame_output = Image.alpha_composite(transparent_background, mask_image)
+    try:
+        # For CPU optimization, we'll process with smaller batch size
+        _, _, out_mask_logits = predictor.add_new_points(
+            inference_state=session_state["inference_state"],
+            frame_idx=0,
+            obj_id=OBJ_ID,
+            points=points,
+            labels=labels,
+        )
+
+        # Create the mask
+        mask_array = (out_mask_logits[0] > 0.0).cpu().numpy()
+
+        # Ensure the mask has the same size as the frame
+        if mask_array.shape[:2] != (h, w):
+            mask_array = cv2.resize(
+                mask_array.astype(np.uint8),
+                (w, h),
+                interpolation=cv2.INTER_NEAREST
+            ).astype(bool)
+
+        mask_image = show_mask(mask_array)
+
+        # Make sure mask_image has the same size as the background
+        if mask_image.size != transparent_background.size:
+            mask_image = mask_image.resize(transparent_background.size, Image.NEAREST)
+
+        first_frame_output = Image.alpha_composite(transparent_background, mask_image)
+    except Exception as e:
+        print(f"Error in segmentation: {e}")
+        # Return just the points as fallback
+        first_frame_output = selected_point_map
 
     return selected_point_map, first_frame_output, session_state
 
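
The resize guard added above goes through uint8 because cv2.resize does not accept boolean arrays, and INTER_NEAREST keeps the output binary, so the cast back to bool loses nothing. The same pattern in isolation (dimensions are arbitrary examples):

import cv2
import numpy as np

mask = np.zeros((100, 100), dtype=bool)
mask[20:60, 30:80] = True

target_h, target_w = 480, 640
resized = cv2.resize(
    mask.astype(np.uint8),           # cv2.resize rejects dtype=bool
    (target_w, target_h),            # OpenCV expects (width, height)
    interpolation=cv2.INTER_NEAREST, # nearest neighbor: no gray in-between values
).astype(bool)

assert resized.shape == (target_h, target_w)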
@@ -247,12 +286,36 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     cmap = plt.get_cmap("tab10")
     cmap_idx = 0 if obj_id is None else obj_id
     color = np.array([*cmap(cmap_idx)[:3], 0.6])
-    h, w = mask.shape[-2:]
-    mask = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
-    mask = (mask * 255).astype(np.uint8)
+
+    # Handle different mask shapes properly
+    if len(mask.shape) == 2:
+        h, w = mask.shape
+    else:
+        h, w = mask.shape[-2:]
+
+    # Ensure correct reshaping based on mask dimensions
+    mask_reshaped = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+    mask_rgba = (mask_reshaped * 255).astype(np.uint8)
+
     if convert_to_image:
-        mask = Image.fromarray(mask, "RGBA")
-    return mask
+        try:
+            # Ensure the mask has correct RGBA shape (h, w, 4)
+            if mask_rgba.shape[2] != 4:
+                # If not RGBA, create a proper RGBA array
+                proper_mask = np.zeros((h, w, 4), dtype=np.uint8)
+                # Copy available channels
+                proper_mask[:, :, :min(mask_rgba.shape[2], 4)] = mask_rgba[:, :, :min(mask_rgba.shape[2], 4)]
+                mask_rgba = proper_mask
+
+            # Create the PIL image
+            return Image.fromarray(mask_rgba, "RGBA")
+        except Exception as e:
+            print(f"Error converting mask to image: {e}")
+            # Fallback: create a blank transparent image of correct size
+            blank = np.zeros((h, w, 4), dtype=np.uint8)
+            return Image.fromarray(blank, "RGBA")
+
+    return mask_rgba
 
 
 def propagate_to_all(
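
The broadcast mask.reshape(h, w, 1) * color.reshape(1, 1, -1) is what turns a boolean mask into a semi-transparent overlay: every True pixel gets the tab10 color with alpha 0.6, every False pixel stays fully transparent. The same arithmetic in isolation, assuming a boolean (h, w) mask:

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

mask = np.zeros((4, 4), dtype=bool)
mask[1:3, 1:3] = True

color = np.array([*plt.get_cmap("tab10")(0)[:3], 0.6])  # RGB plus alpha
rgba = (mask.reshape(4, 4, 1) * color.reshape(1, 1, -1) * 255).astype(np.uint8)
overlay = Image.fromarray(rgba, "RGBA")

assert rgba[2, 2, 3] == 153  # masked pixel: alpha = 0.6 * 255
assert rgba[0, 0, 3] == 0    # unmasked pixel: fully transparent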
@@ -270,62 +333,130 @@ def propagate_to_all(
     )
 
     # For CPU optimization: process in smaller batches
-    chunk_size =
+    chunk_size = 3 # Process 3 frames at a time to avoid memory issues on CPU
 
-    # run propagation throughout the video and collect the results in a dict
-    video_segments = {}  # video_segments contains the per-frame segmentation results
-    print("starting propagate_in_video on CPU")
-    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
-        session_state["inference_state"]
-    ):
-        video_segments[out_frame_idx] = {
-            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-            for i, out_obj_id in enumerate(out_obj_ids)
-        }
-
-
+    try:
+        # run propagation throughout the video and collect the results in a dict
+        video_segments = {}  # video_segments contains the per-frame segmentation results
+        print("starting propagate_in_video on CPU")
+
+        # Get the frames in chunks for CPU memory optimization
+        for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+            session_state["inference_state"]
+        ):
+            try:
+                # Store the masks for each object ID
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                    for i, out_obj_id in enumerate(out_obj_ids)
+                }
+
+                print(f"Processed frame {out_frame_idx}")
+
+                # Release memory periodically
+                if out_frame_idx % chunk_size == 0:
+                    # Explicitly clear any tensors
+                    del out_mask_logits
+                    import gc
+                    gc.collect()
+            except Exception as e:
+                print(f"Error processing frame {out_frame_idx}: {e}")
+                continue
+
+        # For CPU optimization: increase stride to reduce processing
+        # Create a more aggressive stride to limit to fewer frames in output
+        total_frames = len(video_segments)
+        print(f"Total frames processed: {total_frames}")
+
+        # Limit to max 50 frames for CPU processing
+        max_output_frames = 50
+        vis_frame_stride = max(1, total_frames // max_output_frames)
+
+        # Get dimensions of the frames
+        first_frame = session_state["all_frames"][0]
+        h, w = first_frame.shape[:2]
+
+        output_frames = []
+        for out_frame_idx in range(0, total_frames, vis_frame_stride):
+            if out_frame_idx not in video_segments or OBJ_ID not in video_segments[out_frame_idx]:
+                continue
+
+            try:
+                frame = session_state["all_frames"][out_frame_idx]
+                transparent_background = Image.fromarray(frame).convert("RGBA")
+
+                # Get the mask and ensure it's the right size
+                out_mask = video_segments[out_frame_idx][OBJ_ID]
+
+                # Resize mask if dimensions don't match
+                if out_mask.shape[:2] != (h, w):
+                    out_mask = cv2.resize(
+                        out_mask.astype(np.uint8),
+                        (w, h),
+                        interpolation=cv2.INTER_NEAREST
+                    ).astype(bool)
+
+                mask_image = show_mask(out_mask)
+
+                # Make sure mask has same dimensions as background
+                if mask_image.size != transparent_background.size:
+                    mask_image = mask_image.resize(transparent_background.size, Image.NEAREST)
+
+                output_frame = Image.alpha_composite(transparent_background, mask_image)
+                output_frame = np.array(output_frame)
+                output_frames.append(output_frame)
+
+                # Clear memory periodically
+                if len(output_frames) % 10 == 0:
+                    import gc
+                    gc.collect()
+
+            except Exception as e:
+                print(f"Error creating output frame {out_frame_idx}: {e}")
+                continue
+
+        # Create a video clip from the image sequence
+        original_fps = get_video_fps(video_in)
+        fps = original_fps
 
-    #
-    if
-
+        # For CPU optimization - lower FPS if original is high
+        if fps > 15:
+            fps = 15 # Lower fps for CPU processing
+
+        print(f"Creating video with {len(output_frames)} frames at {fps} FPS")
+        clip = ImageSequenceClip(output_frames, fps=fps)
+
+        # Write the result to a file - use lower quality for CPU
+        unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
+        final_vid_output_path = f"output_video_{unique_id}.mp4"
+        final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_output_path)
+
+        # Lower bitrate for CPU processing
+        clip.write_videofile(
+            final_vid_output_path,
+            codec="libx264",
+            bitrate="800k",
+            threads=2, # Use fewer threads for CPU
+            logger=None # Disable logger to reduce console output
+        )
+
+        # Free memory
+        del video_segments
+        del output_frames
+        import gc
+        gc.collect()
 
-
-
-
-    output_frames = []
-    for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
-        transparent_background = Image.fromarray(
-            session_state["all_frames"][out_frame_idx]
-        ).convert("RGBA")
-        out_mask = video_segments[out_frame_idx][OBJ_ID]
-        mask_image = show_mask(out_mask)
-        output_frame = Image.alpha_composite(transparent_background, mask_image)
-        output_frame = np.array(output_frame)
-        output_frames.append(output_frame)
-
-    # Create a video clip from the image sequence
-    original_fps = get_video_fps(video_in)
-    fps = original_fps # Frames per second
-
-    # For CPU optimization - lower FPS if original is high
-    if fps > 24:
-        fps = 24
-
-    clip = ImageSequenceClip(output_frames, fps=fps)
+        return (
+            gr.update(value=final_vid_output_path, visible=True),
+            session_state,
+        )
 
-
-
-
-
-
-
-    clip.write_videofile(final_vid_output_path, codec="libx264", bitrate="1000k")
-
-    return (
-        gr.update(value=final_vid_output_path),
-        session_state,
-    )
+    except Exception as e:
+        print(f"Error in propagate_to_all: {e}")
+        return (
+            gr.update(value=None, visible=False),
+            session_state,
+        )
 
 
 def update_ui():
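
The stride arithmetic in this hunk caps the output length rather than fixing it: max(1, total_frames // max_output_frames) keeps every frame for short clips and samples roughly every Nth frame for long ones. A quick check with the commit's constant of 50:

max_output_frames = 50

for total_frames in (40, 200, 1000):
    stride = max(1, total_frames // max_output_frames)
    kept = len(range(0, total_frames, stride))
    print(total_frames, stride, kept)  # -> 40 1 40, 200 4 50, 1000 20 50

Note that // rounds down, so the count can land slightly above the cap (for example, 220 frames give stride 4 and 55 output frames); for thinning video output an approximate cap is enough.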