Update app.py
app.py (CHANGED)
@@ -10,16 +10,16 @@ from datetime import datetime
 
 import gradio as gr
 
-#
-
-
+# This line might be related to GPU, kept from original
+os.environ["TORCH_CUDNN_SDPA_ENABLED"] = "0,1,2,3,4,5,6,7"
 import tempfile
 
 import cv2
 import matplotlib.pyplot as plt
+# spaces import and decorators are for Hugging Face Spaces GPU allocation,
+# if running locally without spaces, these can be removed or will be ignored.
+import spaces
 import numpy as np
-# Removed spaces decorator import for CPU-only demo
-# import spaces
 import torch
 
 from moviepy.editor import ImageSequenceClip
@@ -38,7 +38,7 @@ description_p = """# Instructions
 </ol>
 """
 
-# examples
+# examples
 examples = [
     ["examples/01_dog.mp4"],
     ["examples/02_cups.mp4"],
@@ -75,77 +75,133 @@ OBJ_ID = 0
 
 sam2_checkpoint = "checkpoints/edgetam.pt"
 model_cfg = "edgetam.yaml"
-#
-# The device is set here and with .to("cpu")
+# Model built for CPU but immediately moved to CUDA in original code
 predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-
-
-
-
-#
-
-#
-
+# *** Original code moves to CUDA ***
+predictor.to("cuda")
+print("predictor loaded on CUDA")
+
+# use bfloat16 for the entire demo - Original code uses CUDA bfloat16
+torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+# Original CUDA settings
+if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8:
+    # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+elif not torch.cuda.is_available():
+    print("Warning: CUDA not available. The original code is configured for GPU.")
+    # Note: Without a GPU, the .to("cuda") calls will likely cause errors.
 
 
 def get_video_fps(video_path):
-
-    if video_path is None or not os.path.exists(video_path):
-        print(f"Warning: Video file not found at {video_path}")
-        return None
+    # Open the video file
     cap = cv2.VideoCapture(video_path)
+
     if not cap.isOpened():
-        print(
+        print("Error: Could not open video.")
         return None
+
+    # Get the FPS of the video
     fps = cap.get(cv2.CAP_PROP_FPS)
-    cap.release()
+    cap.release()  # Release the capture object
     return fps
 
-
+
+def reset(session_state):
+    """Resets the UI and session state."""
+    print("Resetting demo.")
+    session_state["input_points"] = []
+    session_state["input_labels"] = []
+    # Reset the predictor state if it exists
+    if session_state["inference_state"] is not None:
+        # Assuming predictor.reset_state handles None or invalid states gracefully
+        # Or you might need to explicitly pass the state object if required
+        try:
+            predictor.reset_state(session_state["inference_state"])
+            # Explicitly delete or re-init the state object if a full reset is intended
+            # This depends on how predictor.reset_state works
+            # session_state["inference_state"] = None  # Example if reset_state doesn't fully clear
+        except Exception as e:
+            print(f"Error resetting predictor state: {e}")
+            # If reset fails, perhaps force-clear the state object
+            session_state["inference_state"] = None
+
+    session_state["first_frame"] = None
+    session_state["all_frames"] = None
+    session_state["inference_state"] = None  # Ensure state is None after a full reset
+    # Also reset video path if stored
+    session_state["video_path"] = None
+
+    # Resetting UI components
+    return (
+        None,  # video_in (clears the video player)
+        gr.update(open=True),  # video_in_drawer (opens accordion)
+        None,  # points_map (clears the image)
+        None,  # output_image (clears the image)
+        gr.update(value=None, visible=False),  # output_video (hides and clears)
+        session_state,  # return updated session state
+    )
+
+
+def clear_points(session_state):
+    """Clears selected points and resets segmentation on the first frame."""
+    print("Clearing points.")
+    session_state["input_points"] = []
+    session_state["input_labels"] = []
+
+    # Reset the predictor state to clear internal masks/features
+    # This typically doesn't remove the video context, just the mask predictions
+    if session_state["inference_state"] is not None:
+        try:
+            # Assuming reset_state handles clearing current masks/features
+            predictor.reset_state(session_state["inference_state"])
+            print("Predictor state reset for clearing points.")
+            # If you need to re-initialize the state for the *same* video after clearing points,
+            # you might need to call predictor.init_state again here, using the stored video_path.
+            # session_state["inference_state"] = predictor.init_state(video_path=session_state["video_path"], device="cuda")  # Or device="cpu" if modified earlier
+        except Exception as e:
+            print(f"Error resetting predictor state during clear_points: {e}")
+            # If reset fails, this might leave old masks. Depending on SAM2's behavior,
+            # you might need a more aggressive state clear or re-initialization.
+
+    # Return the original first frame image for points_map and clear the output_image
+    first_frame_img = session_state["first_frame"] if session_state["first_frame"] is not None else None
+
+    return (
+        first_frame_img,  # points_map shows original first frame (no points yet)
+        None,  # output_image cleared (no mask)
+        gr.update(value=None, visible=False),  # output_video hidden
+        session_state,  # return updated session state
+    )
+
+
+# Added @spaces.GPU decorator back as it was in the original code
+@spaces.GPU
 def preprocess_video_in(video_path, session_state):
     """Loads video frames and initializes the predictor state."""
     print(f"Processing video: {video_path}")
     if video_path is None or not os.path.exists(video_path):
         print("No video path provided or file not found.")
         # Reset state and UI elements if input is invalid
+        # Need to return updates for the buttons as well
         return (
-            gr.update(open=True),
-            None,
-            None,
-            gr.update(value=None, visible=False),
-            gr.update(interactive=False),  # propagate_btn
-            gr.update(interactive=False),  # clear_points_btn
-            gr.update(interactive=False),  # reset_btn
-            {  # Reset session state
-                "first_frame": None,
-                "all_frames": None,
-                "input_points": [],
-                "input_labels": [],
-                "inference_state": None,
-                "video_path": None,
+            gr.update(open=True), None, None, gr.update(value=None, visible=False),
+            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
+            {  # Reset session state
+                "first_frame": None, "all_frames": None, "input_points": [],
+                "input_labels": [], "inference_state": None, "video_path": None,
            }
        )
 
-    # Read the first frame and all frames
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         print(f"Error: Could not open video file {video_path}.")
-        # Reset state and UI elements on error
         return (
-            gr.update(open=True),
-            None,
-            None,
-            gr.update(value=None, visible=False),
-            gr.update(interactive=False),  # propagate_btn
-            gr.update(interactive=False),  # clear_points_btn
-            gr.update(interactive=False),  # reset_btn
+            gr.update(open=True), None, None, gr.update(value=None, visible=False),
+            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
             {  # Reset session state
-                "first_frame": None,
-                "all_frames": None,
-                "input_points": [],
-                "input_labels": [],
-                "inference_state": None,
-                "video_path": None,
+                "first_frame": None, "all_frames": None, "input_points": [],
+                "input_labels": [], "inference_state": None, "video_path": None,
            }
        )
 
@@ -156,139 +212,65 @@ def preprocess_video_in(video_path, session_state):
         ret, frame = cap.read()
         if not ret:
             break
-        # Convert BGR to RGB
         frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         all_frames.append(frame)
         if first_frame is None:
-            first_frame = frame
+            first_frame = frame
 
     cap.release()
 
     if not all_frames:
         print(f"Error: No frames read from video file {video_path}.")
-        # Reset state and UI elements if no frames are read
         return (
-            gr.update(open=True),
-            None,
-            None,
-            gr.update(value=None, visible=False),
-            gr.update(interactive=False),  # propagate_btn
-            gr.update(interactive=False),  # clear_points_btn
-            gr.update(interactive=False),  # reset_btn
+            gr.update(open=True), None, None, gr.update(value=None, visible=False),
+            gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False),
             {  # Reset session state
-                "first_frame": None,
-                "all_frames": None,
-                "input_points": [],
-                "input_labels": [],
-                "inference_state": None,
-                "video_path": None,
+                "first_frame": None, "all_frames": None, "input_points": [],
+                "input_labels": [], "inference_state": None, "video_path": None,
            }
        )
 
-
-    session_state["first_frame"] = copy.deepcopy(first_frame)  # Store a copy
+    session_state["first_frame"] = copy.deepcopy(first_frame)
     session_state["all_frames"] = all_frames
-    session_state["video_path"] = video_path  # Store
+    session_state["video_path"] = video_path  # Store video path
     session_state["input_points"] = []
     session_state["input_labels"] = []
-    #
+    # Original code did NOT pass device here. It uses the device the predictor is on.
     session_state["inference_state"] = predictor.init_state(video_path=video_path)
     print("Video loaded and predictor state initialized.")
 
+    # Enable buttons after successful load
     return [
         gr.update(open=False),  # video_in_drawer
-        first_frame,  # points_map
-        None,  # output_image
-        gr.update(value=None, visible=False),  # output_video
-        gr.update(interactive=True),  #
-        gr.update(interactive=True),  #
-        gr.update(interactive=True),  #
-        session_state,  #
+        first_frame,  # points_map
+        None,  # output_image
+        gr.update(value=None, visible=False),  # output_video
+        gr.update(interactive=True),  # propagate_btn
+        gr.update(interactive=True),  # clear_points_btn
+        gr.update(interactive=True),  # reset_btn
+        session_state,  # session_state
    ]
 
 
-def reset(session_state):
-
-    print("Resetting demo.")
-    # Clear points and labels
-    session_state["input_points"] = []
-    session_state["input_labels"] = []
-    # Reset the predictor state if it exists
-    if session_state["inference_state"] is not None:
-        predictor.reset_state(session_state["inference_state"])
-        # After reset, we also discard the state object as a new video might be loaded
-        session_state["inference_state"] = None
-    # Clear frames and video path
-    session_state["first_frame"] = None
-    session_state["all_frames"] = None
-    session_state["video_path"] = None
-
-    # Update UI elements to their initial state
-    return (
-        None,  # video_in
-        gr.update(open=True),  # video_in_drawer open
-        None,  # points_map cleared
-        None,  # output_image cleared
-        gr.update(value=None, visible=False),  # output_video hidden
-        gr.update(interactive=False),  # Disable buttons
-        gr.update(interactive=False),  # Disable buttons
-        gr.update(interactive=False),  # Disable buttons
-        session_state,  # Updated session state
-    )
-
-
-def clear_points(session_state):
-    """Clears selected points and resets segmentation on the first frame."""
-    print("Clearing points.")
-    # Clear points and labels lists
-    session_state["input_points"] = []
-    session_state["input_labels"] = []
-
-    # Reset the predictor state if it exists. This clears internal masks/features
-    # but keeps the video context initialized by preprocess_video_in.
-    if session_state["inference_state"] is not None:
-        predictor.reset_state(session_state["inference_state"])
-        # After resetting the state, if we still have the video path, re-initialize the state
-        # to be ready for new points on the same video.
-        if session_state["video_path"] is not None:
-            # Re-initialize state *without* the device argument
-            session_state["inference_state"] = predictor.init_state(video_path=session_state["video_path"])
-            print("Predictor state re-initialized after clearing points.")
-        else:
-            print("Warning: Could not re-initialize state after clear_points (video_path missing).")
-            session_state["inference_state"] = None  # Ensure state is None if video_path is gone
-
-
-    # Re-render the points_map with no points drawn (just the first frame)
-    # Re-render the output_image with no mask (just the first frame)
-    first_frame_img = session_state["first_frame"] if session_state["first_frame"] is not None else None
-
-    return (
-        first_frame_img,  # points_map shows original first frame
-        None,  # output_image cleared
-        gr.update(value=None, visible=False),  # Hide output video
-        session_state,  # Updated session state
-    )
-
-
-# Removed @spaces.GPU decorator
+# Added @spaces.GPU decorator back as it was in the original code
+@spaces.GPU
 def segment_with_points(
     point_type,
     session_state,
     evt: gr.SelectData,
 ):
     """Adds a point prompt and performs segmentation on the first frame."""
-    # Ensure we have
+    # Ensure we have state and first frame
     if session_state["first_frame"] is None or session_state["inference_state"] is None:
         print("Error: Cannot segment. No video loaded or inference state missing.")
-        # Return current
+        # Return current images and state without changes
         return (
-            session_state
-            None,  # output_image
+            session_state.get("first_frame"),  # points_map (show first frame if exists)
+            None,  # output_image (keep cleared)
             session_state,
        )
 
-    # evt.index
+    # evt.index is the (x, y) coordinate tuple
     click_coords = evt.index
     print(f"Clicked at: {click_coords} ({point_type})")
 
@@ -314,12 +296,11 @@ def segment_with_points(
     for index, track in enumerate(session_state["input_points"]):
         # Ensure coordinates are integers for cv2.circle
         point_coords = (int(track[0]), int(track[1]))
+        # Ensure color is RGBA (0-255)
         if session_state["input_labels"][index] == 1:
-            # Green
-            cv2.circle(transparent_layer_points, point_coords, radius, (0, 255, 0, 255), -1)
+            cv2.circle(transparent_layer_points, point_coords, radius, (0, 255, 0, 255), -1)  # Green for include
         else:
-            # Red
-            cv2.circle(transparent_layer_points, point_coords, radius, (255, 0, 0, 255), -1)
+            cv2.circle(transparent_layer_points, point_coords, radius, (255, 0, 0, 255), -1)  # Red for exclude
 
     # Convert the transparent layer back to an image and composite onto the first frame
     transparent_layer_points_pil = Image.fromarray(transparent_layer_points, "RGBA")
@@ -329,13 +310,14 @@ def segment_with_points(
         first_frame_pil.copy(), transparent_layer_points_pil
    )
 
-    # Prepare points and labels as tensors on
+    # Prepare points and labels as tensors on the correct device (CUDA in original code)
     points = np.array(session_state["input_points"], dtype=np.float32)
     labels = np.array(session_state["input_labels"], np.int32)
 
-    # Ensure tensors are on
-
-
+    # Ensure tensors are on the correct device (CUDA as per original code setup)
+    device = next(predictor.parameters()).device  # Get the device the model is on
+    points_tensor = torch.tensor(points, dtype=torch.float32, device=device).unsqueeze(0)  # Add batch dim
+    labels_tensor = torch.tensor(labels, dtype=torch.int32, device=device).unsqueeze(0)  # Add batch dim
 
     # Add new points to the predictor's state and get the mask for the first frame
     # This call performs segmentation on the current frame (frame_idx=0) using all accumulated points
@@ -351,8 +333,9 @@ def segment_with_points(
        )
 
         # Process logits: detach from graph, move to CPU, apply threshold
-        # out_mask_logits is a list of tensors [tensor([H, W])] for the requested obj_id
-
+        # out_mask_logits is a list of tensors [tensor([batch_size, H, W])] for the requested obj_id
+        # Access the result for the first object (index 0) and the first item in batch (index 0)
+        mask_tensor = (out_mask_logits[0][0].detach().cpu() > 0.0)  # Move to CPU before converting to numpy
         mask_numpy = mask_tensor.numpy()  # Convert to numpy
 
         # Get the mask image (RGBA)
@@ -366,6 +349,9 @@ def segment_with_points(
         print(f"Error during segmentation on first frame: {e}")
         # On error, first_frame_output_img remains None
 
+    # Original code clears CUDA cache here
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
     return selected_point_map_img, first_frame_output_img, session_state
 
@@ -416,21 +402,22 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     return colored_mask_uint8
 
 
-#
+# Added @spaces.GPU decorator back as it was in the original code
+@spaces.GPU
 def propagate_to_all(
-    #
-    # but keeping it is fine. Accessing session_state["video_path"] is more robust.
-    video_in,
+    video_in,  # Keep video_in path as in original
     session_state,
 ):
     """Runs mask propagation through the video and generates the output video."""
     print("Starting propagation...")
     # Ensure state is ready
+    # Using session_state.get("video_path") is safer than video_in directly
+    current_video_path = session_state.get("video_path")
     if (
         len(session_state["input_points"]) == 0  # Need at least one point
         or session_state["all_frames"] is None
         or session_state["inference_state"] is None
-        or
+        or current_video_path is None  # Ensure we have the original video path
    ):
         print("Error: Cannot propagate. No points selected, video not loaded, or inference state missing.")
         return (
@@ -439,7 +426,6 @@ def propagate_to_all(
        )
 
     # run propagation throughout the video and collect the results
-    # The generator yields (frame_idx, obj_ids, mask_logits)
     video_segments = {}
     try:
         # This loop performs the core tracking prediction frame by frame
@@ -451,7 +437,7 @@ def propagate_to_all(
             video_segments[out_frame_idx] = {
                 # out_mask_logits is a list of tensors (one per object tracked in this frame)
                 # Each tensor is [batch_size, H, W]. Batch size is 1 here.
-                # Access the first
+                # Access the result for the first object (index i) and the first item in batch (index 0)
                 out_obj_id: (out_mask_logits[i][0].detach().cpu() > 0.0).numpy()
                 for i, out_obj_id in enumerate(out_obj_ids)
            }
@@ -492,9 +478,11 @@ def propagate_to_all(
 
         output_frames.append(output_frame_np)
 
+    # Original code clears CUDA cache here
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
     # Define output path in a temporary directory
-    # Use os.path.join for cross-platform compatibility
     unique_id = datetime.now().strftime("%Y%m%d%H%M%S%f")  # Use microseconds for more uniqueness
     final_vid_filename = f"output_video_{unique_id}.mp4"
     final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_filename)
@@ -502,9 +490,8 @@ def propagate_to_all(
 
 
     # Create a video clip from the image sequence
-    # Get original FPS
-
-    original_fps = get_video_fps(session_state["video_path"])
+    # Get original FPS from the stored video path
+    original_fps = get_video_fps(current_video_path)
     fps = original_fps if original_fps is not None and original_fps > 0 else 30  # Default to 30 if detection fails or is zero
     print(f"Creating output video with FPS: {fps}")
 
@@ -526,20 +513,11 @@ def propagate_to_all(
             session_state,
        )
 
-
     # Write the result to a file. Use 'libx264' codec for broad compatibility.
-    # `preset` and `threads` for CPU optimization.
-    # `logger=None` prevents moviepy from printing progress to stdout/stderr, which can clutter the Gradio logs.
     try:
-        print(f"Writing video file with codec='libx264', fps={fps}
-        clip.write_videofile(
-            final_vid_output_path,
-            codec="libx264",
-            fps=fps,  # Ensure correct FPS is used during writing
-            preset="medium",  # CPU optimization: 'fast', 'faster', 'veryfast' are options for speed vs size
-            threads="auto",  # CPU optimization: Use multiple cores
-            logger=None  # Suppress moviepy output
-        )
+        print(f"Writing video file with codec='libx264', fps={fps}")
+        # Added basic moviepy writing parameters back, similar to original intent
+        clip.write_videofile(final_vid_output_path, codec="libx264", fps=fps)
         print("Video writing complete.")
         # Return the path and make the video player visible
         return (
@@ -563,7 +541,7 @@ def propagate_to_all(
    )
 
 
-def
+def update_ui():
     """Simply returns a Gradio update to make the output video visible."""
     return gr.update(visible=True)
 
@@ -611,12 +589,11 @@ with gr.Blocks() as demo:
             points_map = gr.Image(
                 label="Click on the First Frame to Add Points",  # Clearer label
                 type="numpy",
-                interactive=True,  #
+                interactive=True,  # <--- THIS WAS CHANGED FROM False TO True
                 height=400,  # Set a fixed height for better UI
                 width="auto",  # Let width adjust
                 show_share_button=False,
                 show_download_button=False,
-                # show_label=False  # Can hide label if space is tight
            )
 
         with gr.Column():
@@ -627,7 +604,7 @@ with gr.Blocks() as demo:
                 examples_per_page=8,
                 cache_examples=False,  # Do not cache processed examples, as state is involved
            )
-            # Add padding/space
+            # Add padding/space - removed extra lines as they take up a lot of space
             # gr.Markdown("<br>")
 
             # output_image shows the segmentation mask prediction on the *first* frame
@@ -639,7 +616,6 @@ with gr.Blocks() as demo:
                 width="auto",  # Let width adjust
                 show_share_button=False,
                 show_download_button=False,
-                # show_label=False  # Can hide label
            )
 
             # output_video shows the final tracking result
@@ -649,35 +625,25 @@ with gr.Blocks() as demo:
     # --- Event Handlers ---
 
     # When a new video file is uploaded via the file browser
+    # Added postprocess to update button interactivity based on whether video loaded
     video_in.upload(
         fn=preprocess_video_in,
         inputs=[video_in, session_state],
         outputs=[
-            video_in_drawer,
-            points_map,
-            output_image,  # Clear output image
-            output_video,  # Hide output video
-            propagate_btn,  # Enable Track button
-            clear_points_btn,  # Enable Clear Points button
-            reset_btn,  # Enable Reset button
-            session_state,  # Update session state
+            video_in_drawer, points_map, output_image, output_video,
+            propagate_btn, clear_points_btn, reset_btn, session_state,
        ],
         queue=False,  # Process immediately
    )
 
     # When an example video is selected (change event)
+    # Added postprocess to update button interactivity
     video_in.change(
         fn=preprocess_video_in,
         inputs=[video_in, session_state],
-        outputs=[
-            video_in_drawer,
-            points_map,
-            output_image,  # Clear output image
-            output_video,  # Hide output video
-            propagate_btn,  # Enable Track button
-            clear_points_btn,  # Enable Clear Points button
-            reset_btn,  # Enable Reset button
-            session_state,  # Update session state
+        outputs=[
+            video_in_drawer, points_map, output_image, output_video,
+            propagate_btn, clear_points_btn, reset_btn, session_state,
        ],
         queue=False,  # Process immediately
    )
@@ -716,15 +682,8 @@ with gr.Blocks() as demo:
         fn=reset,
         inputs=[session_state],  # Pass session state
         outputs=[
-            video_in,
-            video_in_drawer,
-            points_map,  # Clear points_map
-            output_image,  # Clear output_image
-            output_video,  # Hide output_video
-            propagate_btn,  # Disable buttons
-            clear_points_btn,  # Disable buttons
-            reset_btn,  # Disable buttons
-            session_state,  # Reset session state
+            video_in, video_in_drawer, points_map, output_image, output_video,
+            propagate_btn, clear_points_btn, reset_btn, session_state,
        ],
         queue=False,  # Process immediately
    )
@@ -743,18 +702,16 @@ with gr.Blocks() as demo:
        ],
         outputs=[
             output_video,  # Update output video player with result
-            session_state,  # Update session state
+            session_state,  # Update session state
        ],
-        #
-
-
-        queue=True,
+        # concurrency_limit from original code (may need adjustment based on your hardware/GPU)
+        concurrency_limit=10,
+        queue=False,  # queue from original code
    )
 
 
 # Launch the Gradio demo
-demo.queue()  # Enable queuing
+demo.queue()  # Enable queuing
 print("Gradio demo starting...")
-# Removed share=True for local debugging unless you specifically need a public link
 demo.launch()
 print("Gradio demo launched.")
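A note on the condensed outputs= lists above: Gradio assigns the values returned by a handler to the components in outputs= by position, so the order of the list must match the order of the function's return values. A small self-contained sketch (hypothetical component and function names, not taken from this app):

import gradio as gr

def load_clip(state):
    # The returned tuple lines up positionally with the outputs list wired below.
    state["loaded"] = True
    return gr.update(visible=False), "ready", state

with gr.Blocks() as demo:
    state = gr.State({})
    status = gr.Textbox(label="status")
    note = gr.Markdown("upload a clip to begin")
    btn = gr.Button("Load")
    btn.click(fn=load_clip, inputs=[state], outputs=[note, status, state])

# demo.launch()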
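The new setup hard-codes CUDA (predictor.to("cuda"), torch.autocast(device_type="cuda", ...)) and, as its own warning notes, will likely fail on a CPU-only machine. A minimal sketch of a device-agnostic variant, assuming the same build_sam2_video_predictor call used above (the import path is an assumption, not shown in this diff):

import torch
# Import path assumed from the SAM2/EdgeTAM codebase; adjust to match this repo.
from sam2.build_sam import build_sam2_video_predictor

sam2_checkpoint = "checkpoints/edgetam.pt"
model_cfg = "edgetam.yaml"

# Pick the device at runtime instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)
predictor.to(device)

# Apply the CUDA-specific optimizations only when a GPU is actually present.
if device == "cuda":
    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
    if torch.cuda.get_device_properties(0).major >= 8:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True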