Wenzheng Chang committed on
Commit d5d6d85 · 1 Parent(s): 159559c

init gradio

Files changed (1)
  1. scripts/demo_gradio.py +760 -287
scripts/demo_gradio.py CHANGED
@@ -17,6 +17,7 @@ from diffusers import (
     CogVideoXTransformer3DModel,
 )
 from transformers import AutoTokenizer, T5EncoderModel


 rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
@@ -39,9 +40,6 @@ from aether.utils.postprocess_utils import (  # noqa: E402
 from aether.utils.visualize_utils import predictions_to_glb  # noqa: E402


-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.
@@ -52,7 +50,7 @@ def seed_all(seed: int = 0) -> None:
     torch.cuda.manual_seed_all(seed)


-# Global pipeline
 cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
 aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
 pipeline = AetherV1PipelineCogVideoX(
@@ -64,22 +62,45 @@ pipeline = AetherV1PipelineCogVideoX(
         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
     ),
     vae=AutoencoderKLCogVideoX.from_pretrained(
-        cogvideox_pretrained_model_name_or_path, subfolder="vae"
     ),
     scheduler=CogVideoXDPMScheduler.from_pretrained(
         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
     ),
     transformer=CogVideoXTransformer3DModel.from_pretrained(
-        aether_pretrained_model_name_or_path, subfolder="transformer"
     ),
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
-pipeline.to(device)


-def build_pipeline() -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
     return pipeline

@@ -395,12 +416,29 @@ def save_output_files(
     for frame_idx in frames_to_save:
         if frame_idx >= pointmap.shape[0]:
             continue
-
         predictions = {
-            "world_points": pointmap[frame_idx : frame_idx + 1],
             "images": rgb[frame_idx : frame_idx + 1],
             "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
-            "camera_poses": poses[frame_idx : frame_idx + 1],
         }

         glb_path = os.path.join(
@@ -423,6 +461,7 @@ def save_output_files(
     return paths


 def process_reconstruction(
     video_file,
     height,
@@ -447,11 +486,13 @@ def process_reconstruction(
     gc.collect()
     torch.cuda.empty_cache()

-    # Set random seed
     seed_all(seed)
-
-    # Build the pipeline
-    pipeline = build_pipeline()

     progress(0.1, "Loading video")
     # Check if video_file is a string or a file object
@@ -545,6 +586,7 @@ def process_reconstruction(
     return None, None, []


 def process_prediction(
     image_file,
     height,
@@ -573,9 +615,14 @@ def process_prediction(

     # Set random seed
     seed_all(seed)

     # Build the pipeline
-    pipeline = build_pipeline()

     progress(0.1, "Loading image")
     # Check if image_file is a string or a file object
@@ -671,6 +718,7 @@ def process_prediction(
     return None, None, []


 def process_planning(
     image_file,
     goal_file,
@@ -700,8 +748,13 @@ def process_planning(
     # Set random seed
     seed_all(seed)

     # Build the pipeline
-    pipeline = build_pipeline()

     progress(0.1, "Loading images")
     # Check if image_file and goal_file are strings or file objects
@@ -807,11 +860,10 @@ def update_task_ui(task):
     """Update UI elements based on selected task."""
     if task == "reconstruction":
         return (
-            gr.update(visible=True),  # video_input
-            gr.update(visible=False),  # image_input
-            gr.update(visible=False),  # goal_input
-            gr.update(visible=False),  # image_preview
-            gr.update(visible=False),  # goal_preview
             gr.update(value=4),  # num_inference_steps
             gr.update(visible=True),  # sliding_window_stride
             gr.update(visible=False),  # use_dynamic_cfg
@@ -821,11 +873,10 @@ def update_task_ui(task):
         )
     elif task == "prediction":
         return (
-            gr.update(visible=False),  # video_input
-            gr.update(visible=True),  # image_input
-            gr.update(visible=False),  # goal_input
-            gr.update(visible=True),  # image_preview
-            gr.update(visible=False),  # goal_preview
             gr.update(value=50),  # num_inference_steps
             gr.update(visible=False),  # sliding_window_stride
             gr.update(visible=True),  # use_dynamic_cfg
@@ -835,11 +886,10 @@ def update_task_ui(task):
         )
     elif task == "planning":
         return (
-            gr.update(visible=False),  # video_input
-            gr.update(visible=True),  # image_input
-            gr.update(visible=True),  # goal_input
-            gr.update(visible=True),  # image_preview
-            gr.update(visible=True),  # goal_preview
             gr.update(value=50),  # num_inference_steps
             gr.update(visible=False),  # sliding_window_stride
             gr.update(visible=True),  # use_dynamic_cfg
@@ -851,16 +901,20 @@ def update_task_ui(task):

 def update_image_preview(image_file):
     """Update the image preview."""
-    if image_file:
-        return image_file.name
-    return None


 def update_goal_preview(goal_file):
     """Update the goal preview."""
-    if goal_file:
-        return goal_file.name
-    return None


 def get_download_link(selected_frame, all_paths):
@@ -892,8 +946,17 @@ with gr.Blocks(
         min-height: 400px;
     }
     .warning {
-        color: #ff9800;
-        font-weight: bold;
     }
     .highlight {
         background-color: rgba(0, 123, 255, 0.1);
@@ -903,9 +966,9 @@ with gr.Blocks(
         margin: 10px 0;
     }
     .task-header {
-        margin-top: 10px;
-        margin-bottom: 15px;
-        font-size: 1.2em;
         font-weight: bold;
         color: #007bff;
     }
@@ -922,9 +985,9 @@ with gr.Blocks(
     }
     .input-section, .params-section, .advanced-section {
         border: 1px solid #ddd;
-        padding: 15px;
         border-radius: 8px;
-        margin-bottom: 15px;
     }
     .logo-container {
         display: flex;
@@ -935,288 +998,703 @@ with gr.Blocks(
935
  max-width: 300px;
936
  height: auto;
937
  }
938
  """,
939
  ) as demo:
940
- with gr.Row(elem_classes=["logo-container"]):
941
- gr.Image("assets/logo.png", show_label=False, elem_classes=["logo-image"])
942
-
943
- gr.Markdown(
944
- """
945
- # Aether: Geometric-Aware Unified World Modeling
946
-
947
- Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with
948
- generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
949
-
950
- 1. **4D dynamic reconstruction** - Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
951
- 2. **Action-Conditioned Video Prediction** - Predict future frames based on initial observation images, with optional conditions of camera trajectory actions.
952
- 3. **Goal-Conditioned Visual Planning** - Generate planning paths from pairs of observation and goal images.
953
-
954
- Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.
955
- """
956
- )
957
-
958
- with gr.Row():
959
- with gr.Column(scale=1):
960
- task = gr.Radio(
961
- ["reconstruction", "prediction", "planning"],
962
- label="Select Task",
963
- value="reconstruction",
964
- info="Choose the task you want to perform",
965
- )
966
-
967
- with gr.Group(elem_classes=["input-section"]):
968
- # Input section - changes based on task
969
- gr.Markdown("## 📥 Input", elem_classes=["task-header"])
970
-
971
- # Task-specific inputs
972
- video_input = gr.Video(
973
- label="Upload Input Video",
974
- sources=["upload"],
975
- visible=True,
976
- interactive=True,
977
- elem_id="video_input",
978
  )
979
-
980
- image_input = gr.File(
981
- label="Upload Start Image",
982
- file_count="single",
983
- file_types=["image"],
984
- visible=False,
985
- interactive=True,
986
- elem_id="image_input",
987
  )
988
 
989
- goal_input = gr.File(
990
- label="Upload Goal Image",
991
- file_count="single",
992
- file_types=["image"],
993
- visible=False,
994
- interactive=True,
995
- elem_id="goal_input",
996
  )
997
 
998
- with gr.Row(visible=False) as preview_row:
999
- image_preview = gr.Image(
1000
- label="Start Image Preview",
1001
- elem_id="image_preview",
1002
- visible=False,
1003
- )
1004
- goal_preview = gr.Image(
1005
- label="Goal Image Preview",
1006
- elem_id="goal_preview",
1007
- visible=False,
1008
  )
1009
 
1010
- with gr.Group(elem_classes=["params-section"]):
1011
- gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
1012
 
1013
- with gr.Row():
1014
- with gr.Column(scale=1):
1015
- height = gr.Dropdown(
1016
- choices=[480],
1017
- value=480,
1018
- label="Height",
1019
- info="Height of the output video",
1020
  )
1021
-
1022
- with gr.Column(scale=1):
1023
- width = gr.Dropdown(
1024
- choices=[720],
1025
- value=720,
1026
- label="Width",
1027
- info="Width of the output video",
1028
  )
1029
 
1030
- with gr.Row():
1031
- with gr.Column(scale=1):
1032
- num_frames = gr.Dropdown(
1033
- choices=[17, 25, 33, 41],
1034
- value=41,
1035
- label="Number of Frames",
1036
- info="Number of frames to predict",
1037
  )
1038
-
1039
- with gr.Column(scale=1):
1040
- fps = gr.Dropdown(
1041
- choices=[8, 10, 12, 15, 24],
1042
- value=12,
1043
- label="FPS",
1044
- info="Frames per second",
1045
  )
1046
 
1047
- with gr.Row():
1048
- with gr.Column(scale=1):
1049
- num_inference_steps = gr.Slider(
1050
- minimum=1,
1051
- maximum=60,
1052
- value=4,
1053
- step=1,
1054
- label="Inference Steps",
1055
- info="Number of inference step",
1056
  )
1057
 
1058
- sliding_window_stride = gr.Slider(
1059
- minimum=1,
1060
- maximum=40,
1061
- value=24,
1062
- step=1,
1063
- label="Sliding Window Stride",
1064
- info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
1065
- visible=True,
1066
- )
1067
-
1068
- use_dynamic_cfg = gr.Checkbox(
1069
- label="Use Dynamic CFG",
1070
- value=True,
1071
- info="Use dynamic CFG",
1072
- visible=False,
1073
- )
1074
-
1075
- raymap_option = gr.Radio(
1076
- choices=["backward", "forward_right", "left_forward", "right"],
1077
- label="Camera Movement Direction",
1078
- value="forward_right",
1079
- info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
1080
- visible=False,
1081
- )
1082
 
1083
- post_reconstruction = gr.Checkbox(
1084
- label="Post-Reconstruction",
1085
- value=True,
1086
- info="Run reconstruction after prediction for better quality",
1087
- visible=False,
1088
- )
1089
 
1090
- with gr.Accordion(
1091
- "Advanced Options", open=False, visible=True
1092
- ) as advanced_options:
1093
- with gr.Group(elem_classes=["advanced-section"]):
1094
  with gr.Row():
1095
  with gr.Column(scale=1):
1096
- guidance_scale = gr.Slider(
1097
- minimum=1.0,
1098
- maximum=10.0,
1099
- value=1.0,
1100
- step=0.1,
1101
- label="Guidance Scale",
1102
- info="Guidance scale (only for prediction / planning)",
1103
  )
1104
 
1105
- with gr.Row():
1106
  with gr.Column(scale=1):
1107
- seed = gr.Number(
1108
- value=42,
1109
- label="Random Seed",
1110
- info="Set a seed for reproducible results",
1111
- precision=0,
1112
- minimum=0,
1113
- maximum=2147483647,
1114
  )
1115
 
1116
  with gr.Row():
1117
  with gr.Column(scale=1):
1118
- smooth_camera = gr.Checkbox(
1119
- label="Smooth Camera",
1120
- value=True,
1121
- info="Apply smoothing to camera trajectory",
 
1122
  )
1123
 
1124
  with gr.Column(scale=1):
1125
- align_pointmaps = gr.Checkbox(
1126
- label="Align Point Maps",
1127
- value=False,
1128
- info="Align point maps across frames",
 
1129
  )
1130
 
1131
  with gr.Row():
1132
- with gr.Column(scale=1):
1133
- max_depth = gr.Slider(
1134
- minimum=10,
1135
- maximum=200,
1136
- value=60,
1137
- step=10,
1138
- label="Max Depth",
1139
- info="Maximum depth for point cloud (higher = more distant points)",
1140
- )
1141
-
1142
- with gr.Column(scale=1):
1143
- rtol = gr.Slider(
1144
- minimum=0.01,
1145
- maximum=2.0,
1146
- value=0.03,
1147
- step=0.01,
1148
- label="Relative Tolerance",
1149
- info="Used for depth edge detection. Lower = remove more edges",
1150
- )
1151
 
1152
- pointcloud_save_frame_interval = gr.Slider(
1153
  minimum=1,
1154
- maximum=20,
1155
- value=10,
1156
  step=1,
1157
- label="Point Cloud Frame Interval",
1158
- info="Save point cloud every N frames (higher = fewer files but less complete representation)",
 
1159
  )
1160
 
1161
- run_button = gr.Button("Run Aether", variant="primary")
1162
-
1163
- with gr.Column(scale=1, elem_classes=["output-column"]):
1164
- with gr.Group():
1165
- gr.Markdown("## 📤 Output", elem_classes=["task-header"])
1166
-
1167
- gr.Markdown("### RGB Video", elem_classes=["output-subtitle"])
1168
- rgb_output = gr.Video(
1169
- label="RGB Output", interactive=False, elem_id="rgb_output"
1170
- )
1171
-
1172
- gr.Markdown("### Depth Video", elem_classes=["output-subtitle"])
1173
- depth_output = gr.Video(
1174
- label="Depth Output", interactive=False, elem_id="depth_output"
1175
- )
1176
-
1177
- gr.Markdown("### Point Clouds", elem_classes=["output-subtitle"])
1178
- with gr.Row(elem_classes=["flex-display"]):
1179
- pointcloud_frames = gr.Dropdown(
1180
- label="Select Frame",
1181
- choices=[],
1182
- value=None,
1183
- interactive=True,
1184
- elem_id="pointcloud_frames",
1185
- )
1186
- pointcloud_download = gr.DownloadButton(
1187
- label="Download Point Cloud",
1188
  visible=False,
1189
- elem_id="pointcloud_download",
1190
  )
1191
 
1192
- model_output = gr.Model3D(
1193
- label="Point Cloud Viewer", interactive=True, elem_id="model_output"
1194
- )
1195
 
1196
- with gr.Tab("About Results"):
1197
- gr.Markdown(
1198
- """
1199
- ### Understanding the Outputs
1200
 
1201
- - **RGB Video**: Shows the predicted or reconstructed RGB frames
1202
- - **Depth Video**: Visualizes the disparity maps in color (closer = red, further = blue)
1203
- - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
1204
 
1205
- <p class="warning">Note: 3D point clouds take a long time to visualize, and we show the keyframes only.
1206
- You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.</p>
1207
- """
1208
- )
1209
 
1210
  # Event handlers
1211
  task.change(
1212
  fn=update_task_ui,
1213
  inputs=[task],
1214
  outputs=[
1215
- video_input,
1216
- image_input,
1217
- goal_input,
1218
- image_preview,
1219
- goal_preview,
1220
  num_inference_steps,
1221
  sliding_window_stride,
1222
  use_dynamic_cfg,
@@ -1227,11 +1705,15 @@ with gr.Blocks(
     )

     image_input.change(
-        fn=update_image_preview, inputs=[image_input], outputs=[image_preview]
     ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])

     goal_input.change(
-        fn=update_goal_preview, inputs=[goal_input], outputs=[goal_preview]
     ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])

     def update_pointcloud_frames(pointcloud_paths):
@@ -1453,17 +1935,8 @@ with gr.Blocks(
         outputs=[pointcloud_download],
     )

-    # Example Accordion
-    with gr.Accordion("Examples"):
-        gr.Markdown(
-            """
-            ### Examples will be added soon
-            Check back for example inputs for each task type.
-            """
-        )
-
     # Load the model at startup
-    demo.load(lambda: build_pipeline(), inputs=None, outputs=None)

 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     CogVideoXTransformer3DModel,
 )
 from transformers import AutoTokenizer, T5EncoderModel
+import spaces


 rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)

 from aether.utils.visualize_utils import predictions_to_glb  # noqa: E402


 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.

     torch.cuda.manual_seed_all(seed)


+# # Global pipeline
 cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
 aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
 pipeline = AetherV1PipelineCogVideoX(

         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
     ),
     vae=AutoencoderKLCogVideoX.from_pretrained(
+        cogvideox_pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16
     ),
     scheduler=CogVideoXDPMScheduler.from_pretrained(
         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
     ),
     transformer=CogVideoXTransformer3DModel.from_pretrained(
+        aether_pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16
     ),
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
+# pipeline.to(device)


+def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
+    # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
+    # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
+    # pipeline = AetherV1PipelineCogVideoX(
+    #     tokenizer=AutoTokenizer.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path,
+    #         subfolder="tokenizer",
+    #     ),
+    #     text_encoder=T5EncoderModel.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
+    #     ),
+    #     vae=AutoencoderKLCogVideoX.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="vae"
+    #     ),
+    #     scheduler=CogVideoXDPMScheduler.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
+    #     ),
+    #     transformer=CogVideoXTransformer3DModel.from_pretrained(
+    #         aether_pretrained_model_name_or_path, subfolder="transformer"
+    #     ),
+    # )
+    # pipeline.vae.enable_slicing()
+    # pipeline.vae.enable_tiling()
+    pipeline.to(device)
     return pipeline
 
     for frame_idx in frames_to_save:
         if frame_idx >= pointmap.shape[0]:
             continue
+
+        # fix the problem of point cloud being upside down and left-right reversed: flip Y axis and X axis
+        flipped_pointmap = pointmap[frame_idx:frame_idx+1].copy()
+        flipped_pointmap[..., 1] = -flipped_pointmap[..., 1]  # flip Y axis (up and down)
+        flipped_pointmap[..., 0] = -flipped_pointmap[..., 0]  # flip X axis (left and right)
+
+        # flip camera poses
+        flipped_poses = poses[frame_idx:frame_idx+1].copy()
+        # flip Y axis and X axis of camera orientation
+        flipped_poses[..., 1, :3] = -flipped_poses[..., 1, :3]  # flip Y axis of camera orientation
+        flipped_poses[..., 0, :3] = -flipped_poses[..., 0, :3]  # flip X axis of camera orientation
+        flipped_poses[..., :3, 1] = -flipped_poses[..., :3, 1]  # flip Y axis of camera orientation
+        flipped_poses[..., :3, 0] = -flipped_poses[..., :3, 0]  # flip X axis of camera orientation
+        # flip Y axis and X axis of camera position
+        flipped_poses[..., 1, 3] = -flipped_poses[..., 1, 3]  # flip Y axis position
+        flipped_poses[..., 0, 3] = -flipped_poses[..., 0, 3]  # flip X axis position
+
+        # use flipped point cloud and camera poses
         predictions = {
+            "world_points": flipped_pointmap,
             "images": rgb[frame_idx : frame_idx + 1],
             "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
+            "camera_poses": flipped_poses,
         }

         glb_path = os.path.join(
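For reference, the per-frame flip added above is equivalent to conjugating each camera-to-world pose with F = diag(-1, -1, 1) and negating the X/Y components of every world point. A minimal standalone sketch of that equivalence (illustrative only; `points` and `pose` are hypothetical inputs, not names from this script):

    import numpy as np

    F = np.diag([-1.0, -1.0, 1.0])               # negate the X and Y axes

    points = np.random.rand(480, 720, 3)         # hypothetical H x W x 3 world points
    pose = np.eye(4)                             # hypothetical 4 x 4 camera-to-world pose

    flipped_points = points @ F                  # negates points[..., 0] and points[..., 1]

    flipped_pose = pose.copy()
    flipped_pose[:3, :3] = F @ pose[:3, :3] @ F  # the row flips plus the column flips on the rotation
    flipped_pose[:3, 3] = F @ pose[:3, 3]        # the row flips on the translation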
 
     return paths


+@spaces.GPU(duration=300)
 def process_reconstruction(
     video_file,
     height,
 
     gc.collect()
     torch.cuda.empty_cache()

     seed_all(seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
+    pipeline = build_pipeline(device)

     progress(0.1, "Loading video")
     # Check if video_file is a string or a file object
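The device handling above (and in the prediction and planning handlers below) follows the Hugging Face Spaces ZeroGPU pattern: the pipeline is constructed once at import time and only moved to CUDA inside functions decorated with `@spaces.GPU`. A minimal sketch of that pattern, assuming the `spaces` package and a ZeroGPU Space; the tiny `torch.nn.Linear` stands in for the real pipeline:

    import spaces
    import torch

    # Built at import time; stays on CPU until a GPU slice is granted.
    model = torch.nn.Linear(8, 2)

    @spaces.GPU(duration=300)  # request a ZeroGPU slice for up to 300 seconds
    def run(batch: torch.Tensor) -> torch.Tensor:
        device = torch.device("cuda")
        model.to(device)       # weights reach the GPU only inside the decorated call
        with torch.no_grad():
            return model(batch.to(device))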
 
     return None, None, []


+@spaces.GPU(duration=300)
 def process_prediction(
     image_file,
     height,
 

     # Set random seed
     seed_all(seed)
+
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")

     # Build the pipeline
+    pipeline = build_pipeline(device)

     progress(0.1, "Loading image")
     # Check if image_file is a string or a file object
 
     return None, None, []


+@spaces.GPU(duration=300)
 def process_planning(
     image_file,
     goal_file,
 
     # Set random seed
     seed_all(seed)

+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
     # Build the pipeline
+    pipeline = build_pipeline(device)

     progress(0.1, "Loading images")
     # Check if image_file and goal_file are strings or file objects
 
     """Update UI elements based on selected task."""
     if task == "reconstruction":
         return (
+            gr.update(visible=True),  # reconstruction_group
+            gr.update(visible=False),  # prediction_group
+            gr.update(visible=False),  # planning_group
+            gr.update(visible=False),  # preview_row
             gr.update(value=4),  # num_inference_steps
             gr.update(visible=True),  # sliding_window_stride
             gr.update(visible=False),  # use_dynamic_cfg

         )
     elif task == "prediction":
         return (
+            gr.update(visible=False),  # reconstruction_group
+            gr.update(visible=True),  # prediction_group
+            gr.update(visible=False),  # planning_group
+            gr.update(visible=True),  # preview_row
             gr.update(value=50),  # num_inference_steps
             gr.update(visible=False),  # sliding_window_stride
             gr.update(visible=True),  # use_dynamic_cfg

         )
     elif task == "planning":
         return (
+            gr.update(visible=False),  # reconstruction_group
+            gr.update(visible=False),  # prediction_group
+            gr.update(visible=True),  # planning_group
+            gr.update(visible=True),  # preview_row
             gr.update(value=50),  # num_inference_steps
             gr.update(visible=False),  # sliding_window_stride
             gr.update(visible=True),  # use_dynamic_cfg
901
 
902
  def update_image_preview(image_file):
903
  """Update the image preview."""
904
+ if image_file is None:
905
+ return None
906
+ if isinstance(image_file, str):
907
+ return image_file
908
+ return image_file.name if hasattr(image_file, 'name') else None
909
 
910
 
911
  def update_goal_preview(goal_file):
912
  """Update the goal preview."""
913
+ if goal_file is None:
914
+ return None
915
+ if isinstance(goal_file, str):
916
+ return goal_file
917
+ return goal_file.name if hasattr(goal_file, 'name') else None
918
 
919
 
920
  def get_download_link(selected_frame, all_paths):
 
946
  min-height: 400px;
947
  }
948
  .warning {
949
+ color: #856404 !important;
950
+ font-weight: bold !important;
951
+ padding: 10px !important;
952
+ background-color: #fff3cd !important;
953
+ border-left: 4px solid #ffc107 !important;
954
+ border-radius: 4px !important;
955
+ margin: 10px 0 !important;
956
+ }
957
+ .dark .warning {
958
+ background-color: rgba(255, 193, 7, 0.1) !important;
959
+ color: #fbd38d !important;
960
  }
961
  .highlight {
962
  background-color: rgba(0, 123, 255, 0.1);
 
966
  margin: 10px 0;
967
  }
968
  .task-header {
969
+ margin-top: 15px;
970
+ margin-bottom: 20px;
971
+ font-size: 1.4em;
972
  font-weight: bold;
973
  color: #007bff;
974
  }
 
985
  }
986
  .input-section, .params-section, .advanced-section {
987
  border: 1px solid #ddd;
988
+ padding: 20px;
989
  border-radius: 8px;
990
+ margin-bottom: 20px;
991
  }
992
  .logo-container {
993
  display: flex;
 
998
  max-width: 300px;
999
  height: auto;
1000
  }
1001
+
1002
+ /* Optimize layout and spacing */
1003
+ .container {
1004
+ margin: 0 auto;
1005
+ padding: 0 15px;
1006
+ max-width: 1800px;
1007
+ }
1008
+
1009
+ .header {
1010
+ text-align: center;
1011
+ margin-bottom: 20px;
1012
+ padding: 15px;
1013
+ background: linear-gradient(to right, #f8f9fa, #e9ecef);
1014
+ border-radius: 10px;
1015
+ }
1016
+
1017
+ .dark .header {
1018
+ background: linear-gradient(to right, #2d3748, #1a202c);
1019
+ }
1020
+
1021
+ .main-title {
1022
+ font-size: 2.2em;
1023
+ font-weight: bold;
1024
+ margin: 0 auto;
1025
+ color: #2c3e50;
1026
+ max-width: 800px;
1027
+ }
1028
+
1029
+ .dark .main-title {
1030
+ color: #e2e8f0;
1031
+ }
1032
+
1033
+ .links-bar {
1034
+ display: flex;
1035
+ justify-content: center;
1036
+ gap: 15px;
1037
+ margin: 12px 0;
1038
+ }
1039
+
1040
+ .link-button {
1041
+ display: inline-flex;
1042
+ align-items: center;
1043
+ padding: 6px 12px;
1044
+ background-color: #007bff;
1045
+ color: white !important;
1046
+ text-decoration: none;
1047
+ border-radius: 5px;
1048
+ transition: background-color 0.3s;
1049
+ font-size: 0.95em;
1050
+ }
1051
+
1052
+ .link-button:hover {
1053
+ background-color: #0056b3;
1054
+ text-decoration: none;
1055
+ }
1056
+
1057
+ .features-limitations-container {
1058
+ display: flex;
1059
+ gap: 15px;
1060
+ margin: 20px 0;
1061
+ }
1062
+
1063
+ .capabilities-box, .limitations-box {
1064
+ flex: 1;
1065
+ padding: 18px;
1066
+ border-radius: 8px;
1067
+ margin-bottom: 15px;
1068
+ }
1069
+
1070
+ .capabilities-box {
1071
+ background: #f0f9ff;
1072
+ border-left: 5px solid #3498db;
1073
+ }
1074
+
1075
+ .dark .capabilities-box {
1076
+ background: #172a3a;
1077
+ border-left: 5px solid #3498db;
1078
+ }
1079
+
1080
+ .limitations-box {
1081
+ background: #f8f9fa;
1082
+ border-left: 5px solid #ffc107;
1083
+ }
1084
+
1085
+ .dark .limitations-box {
1086
+ background: #2d2a20;
1087
+ border-left: 5px solid #ffc107;
1088
+ }
1089
+
1090
+ .capabilities-text, .limitations-text {
1091
+ color: #495057;
1092
+ line-height: 1.6;
1093
+ }
1094
+
1095
+ .dark .capabilities-text, .dark .limitations-text {
1096
+ color: #cbd5e0;
1097
+ }
1098
+
1099
+ .capabilities-text h3 {
1100
+ color: #2980b9;
1101
+ margin-top: 0;
1102
+ margin-bottom: 15px;
1103
+ }
1104
+
1105
+ .dark .capabilities-text h3 {
1106
+ color: #63b3ed;
1107
+ }
1108
+
1109
+ .limitations-text h3 {
1110
+ color: #d39e00;
1111
+ margin-top: 0;
1112
+ margin-bottom: 15px;
1113
+ }
1114
+
1115
+ .dark .limitations-text h3 {
1116
+ color: #fbd38d;
1117
+ }
1118
+
1119
+ .capabilities-text blockquote, .limitations-text blockquote {
1120
+ margin: 20px 0 0 0;
1121
+ padding: 10px 20px;
1122
+ font-style: italic;
1123
+ }
1124
+
1125
+ .capabilities-text blockquote {
1126
+ border-left: 3px solid #3498db;
1127
+ background: rgba(52, 152, 219, 0.1);
1128
+ }
1129
+
1130
+ .dark .capabilities-text blockquote {
1131
+ background: rgba(52, 152, 219, 0.2);
1132
+ }
1133
+
1134
+ .limitations-text blockquote {
1135
+ border-left: 3px solid #ffc107;
1136
+ background: rgba(255, 193, 7, 0.1);
1137
+ }
1138
+
1139
+ .dark .limitations-text blockquote {
1140
+ background: rgba(255, 193, 7, 0.2);
1141
+ }
1142
+
1143
+ /* Optimize layout and spacing */
1144
+ .main-interface {
1145
+ display: flex;
1146
+ gap: 30px;
1147
+ margin-top: 20px;
1148
+ }
1149
+
1150
+ .input-column, .output-column {
1151
+ flex: 1;
1152
+ min-width: 0;
1153
+ display: flex;
1154
+ flex-direction: column;
1155
+ }
1156
+
1157
+ .output-panel {
1158
+ border: 1px solid #ddd;
1159
+ border-radius: 8px;
1160
+ padding: 20px;
1161
+ height: 100%;
1162
+ display: flex;
1163
+ flex-direction: column;
1164
+ overflow-y: auto;
1165
+ }
1166
+
1167
+ .dark .output-panel {
1168
+ border-color: #4a5568;
1169
+ }
1170
+
1171
+ .run-button-container {
1172
+ display: flex;
1173
+ justify-content: center;
1174
+ margin: 15px 0;
1175
+ }
1176
+
1177
+ .run-button {
1178
+ padding: 10px 30px;
1179
+ font-size: 1.1em;
1180
+ font-weight: bold;
1181
+ background: linear-gradient(to right, #3498db, #2980b9);
1182
+ border: none;
1183
+ border-radius: 5px;
1184
+ color: white;
1185
+ cursor: pointer;
1186
+ transition: all 0.3s;
1187
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
1188
+ }
1189
+
1190
+ .run-button:hover {
1191
+ background: linear-gradient(to right, #2980b9, #1a5276);
1192
+ box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
1193
+ transform: translateY(-2px);
1194
+ }
1195
+
1196
+ .task-selector {
1197
+ background-color: #f8f9fa;
1198
+ padding: 12px;
1199
+ border-radius: 8px;
1200
+ margin-bottom: 15px;
1201
+ border: 1px solid #e9ecef;
1202
+ }
1203
+
1204
+ .dark .task-selector {
1205
+ background-color: #2d3748;
1206
+ border-color: #4a5568;
1207
+ }
1208
+
1209
+ /* Compact parameter settings */
1210
+ .compact-params .row {
1211
+ margin-bottom: 8px;
1212
+ }
1213
+
1214
+ .compact-params label {
1215
+ margin-bottom: 4px;
1216
+ }
1217
+
1218
+ /* More obvious advanced options */
1219
+ .advanced-options-header {
1220
+ background-color: #e9ecef;
1221
+ padding: 10px 15px;
1222
+ border-radius: 6px;
1223
+ margin-top: 10px;
1224
+ font-weight: bold;
1225
+ color: #495057;
1226
+ border-left: 4px solid #6c757d;
1227
+ cursor: pointer;
1228
+ transition: all 0.2s;
1229
+ }
1230
+
1231
+ .advanced-options-header:hover {
1232
+ background-color: #dee2e6;
1233
+ }
1234
+
1235
+ .dark .advanced-options-header {
1236
+ background-color: #2d3748;
1237
+ color: #e2e8f0;
1238
+ border-left: 4px solid #a0aec0;
1239
+ }
1240
+
1241
+ .dark .advanced-options-header:hover {
1242
+ background-color: #4a5568;
1243
+ }
1244
+
1245
+ /* Vertical arrangement of output section */
1246
+ .output-section {
1247
+ margin-bottom: 30px;
1248
+ border: 1px solid #e9ecef;
1249
+ border-radius: 8px;
1250
+ padding: 20px;
1251
+ }
1252
+
1253
+ .output-section-title {
1254
+ font-weight: bold;
1255
+ color: #495057;
1256
+ margin-bottom: 15px;
1257
+ font-size: 1.2em;
1258
+ }
1259
+
1260
+ .dark .output-section-title {
1261
+ color: #e2e8f0;
1262
+ }
1263
+
1264
+ .pointcloud-controls {
1265
+ display: flex;
1266
+ gap: 10px;
1267
+ margin-bottom: 10px;
1268
+ align-items: center;
1269
+ }
1270
+
1271
+ .note-box {
1272
+ background-color: #fff8e1 !important;
1273
+ border-left: 4px solid #ffc107 !important;
1274
+ padding: 12px !important;
1275
+ margin: 15px 0 !important;
1276
+ border-radius: 4px !important;
1277
+ color: #333 !important;
1278
+ }
1279
+
1280
+ .dark .note-box {
1281
+ background-color: rgba(255, 193, 7, 0.1) !important;
1282
+ color: #e0e0e0 !important;
1283
+ }
1284
+
1285
+ .note-box p, .note-box strong {
1286
+ color: inherit !important;
1287
+ }
1288
+
1289
+ /* Ensure warning class styles are correctly applied */
1290
+ .warning {
1291
+ color: #856404 !important;
1292
+ font-weight: bold !important;
1293
+ padding: 10px !important;
1294
+ background-color: #fff3cd !important;
1295
+ border-left: 4px solid #ffc107 !important;
1296
+ border-radius: 4px !important;
1297
+ margin: 10px 0 !important;
1298
+ }
1299
+
1300
+ .dark .warning {
1301
+ background-color: rgba(255, 193, 7, 0.1) !important;
1302
+ color: #fbd38d !important;
1303
+ }
1304
+
1305
+ .warning-box {
1306
+ background-color: #fff3cd;
1307
+ border-left: 4px solid #ffc107;
1308
+ padding: 12px;
1309
+ margin: 15px 0;
1310
+ border-radius: 4px;
1311
+ color: #856404;
1312
+ }
1313
+
1314
+ .dark .warning-box {
1315
+ background-color: rgba(255, 193, 7, 0.1);
1316
+ color: #fbd38d;
1317
+ }
1318
  """,
1319
  ) as demo:
1320
+ with gr.Column(elem_classes=["container"]):
1321
+ with gr.Row(elem_classes=["header"]):
1322
+ with gr.Column():
1323
+ gr.Markdown(
1324
+ """
1325
+ # Aether: Geometric-Aware Unified World Modeling
1326
+ """,
1327
+ elem_classes=["main-title"]
1328
  )
1329
+
1330
+ gr.Markdown(
1331
+ """
1332
+ <div class="links-bar">
1333
+ 🌐<a href="https://aether-world.github.io/" class="link-button" target="_blank"> Project Page</a>
1334
+ 📄<a href="https://arxiv.org/abs/2503.18945" class="link-button" target="_blank"> Paper</a>
1335
+ 💻<a href="https://github.com/OpenRobotLab/Aether" class="link-button" target="_blank"> Code</a>
1336
+ 🤗<a href="https://huggingface.co/AetherWorldModel/AetherV1" class="link-button" target="_blank"> Model</a>
1337
+ </div>
1338
+ """,
1339
  )
1340
 
1341
+ with gr.Row(elem_classes=["features-limitations-container"]):
1342
+ with gr.Column(elem_classes=["capabilities-box"]):
1343
+ gr.Markdown(
1344
+ """
1345
+ ### 🚀 Key Capabilities
1346
+
1347
+ Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
1348
+
1349
+ - 🌏 **4D Dynamic Reconstruction**: Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
1350
+
1351
+ - 🎬 **Action-Conditioned Prediction**: Predict future frames based on initial observations, with optional camera trajectory actions.
1352
+
1353
+ - 🎯 **Goal-Conditioned Planning**: Generate planning paths from pairs of observation and goal images.
1354
+
1355
+ > *Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.*
1356
+ """,
1357
+ elem_classes=["capabilities-text"]
1358
+ )
1359
+
1360
+ with gr.Column(elem_classes=["limitations-box"]):
1361
+ gr.Markdown(
1362
+ """
1363
+ ### 📝 Current Limitations
1364
+
1365
+ Aether represents an initial step in our journey, trained entirely on synthetic data. While it demonstrates promising capabilities, it is important to be aware of its current limitations:
1366
+
1367
+ - 🔄 **Dynamic Scenarios**: Struggles with highly dynamic scenarios involving significant motion or dense crowds.
1368
+
1369
+ - 📸 **Camera Stability**: Camera pose estimation can be less stable in certain conditions.
1370
+
1371
+ - 📐 **Planning Range**: For visual planning tasks, we recommend keeping the observations and goals relatively close to ensure optimal performance.
1372
+
1373
+ > *We are actively working on the next generation of Aether and are committed to addressing these limitations in future releases.*
1374
+ """,
1375
+ elem_classes=["limitations-text"]
1376
  )
1377
 
1378
+ with gr.Row(elem_classes=["main-interface"]):
1379
+ with gr.Column(elem_classes=["input-column"]):
1380
+ gpu_time_warning = gr.Markdown(
1381
+ """
1382
+ <div class="warning-box">
1383
+ <strong>⚠️ Warning:</strong><br>
1384
+ Due to Hugging Face Spaces ZeroGPU quota limitations, only short video reconstruction tasks (fewer than 100 frames) can be completed online.
1385
+
1386
+ <strong>💻 Recommendation:</strong><br>
1387
+ We strongly encourage you to deploy Aether locally for:
1388
+ - Processing longer video reconstruction tasks
1389
+ - Better performance and full access to prediction and planning tasks
1390
+
1391
+ Visit our <a href="https://github.com/OpenRobotLab/Aether" target="_blank">GitHub repository</a> for local deployment instructions.
1392
+ </div>
1393
+ """,
1394
+ )
1395
+ with gr.Group(elem_classes=["task-selector"]):
1396
+ task = gr.Radio(
1397
+ ["reconstruction", "prediction", "planning"],
1398
+ label="Select Task",
1399
+ value="reconstruction",
1400
+ info="Choose the task you want to perform",
1401
  )
1402
 
1403
+ with gr.Group(elem_classes=["input-section"]):
1404
+ gr.Markdown("## 📥 Input", elem_classes=["task-header"])
1405
 
1406
+ # Task-specific inputs
1407
+ with gr.Group(visible=True) as reconstruction_group:
1408
+ video_input = gr.Video(
1409
+ label="Upload Input Video",
1410
+ sources=["upload"],
1411
+ interactive=True,
1412
+ elem_id="video_input",
1413
  )
1414
+ reconstruction_examples = gr.Examples(
1415
+ examples=[
1416
+ ["assets/example_videos/bridge.mp4"],
1417
+ ["assets/example_videos/moviegen.mp4"],
1418
+ ["assets/example_videos/nuscenes.mp4"],
1419
+ ["assets/example_videos/veo2.mp4"],
1420
+ ],
1421
+ inputs=[video_input],
1422
+ label="Reconstruction Examples",
1423
+ examples_per_page=4,
1424
  )
1425
 
1426
+ with gr.Group(visible=False) as prediction_group:
1427
+ image_input = gr.Image(
1428
+ label="Upload Start Image",
1429
+ type="filepath",
1430
+ interactive=True,
1431
+ elem_id="image_input",
 
1432
  )
1433
+ prediction_examples = gr.Examples(
1434
+ examples=[
1435
+ ["assets/example_obs/car.png"],
1436
+ ["assets/example_obs/cartoon.png"],
1437
+ ["assets/example_obs/garden.jpg"],
1438
+ ["assets/example_obs/room.jpg"],
1439
+ ],
1440
+ inputs=[image_input],
1441
+ label="Prediction Examples",
1442
+ examples_per_page=4,
1443
  )
1444
 
1445
+ with gr.Group(visible=False) as planning_group:
1446
+ with gr.Row():
1447
+ image_input_planning = gr.Image(
1448
+ label="Upload Start Image",
1449
+ type="filepath",
1450
+ interactive=True,
1451
+ elem_id="image_input_planning",
1452
+ )
1453
+ goal_input = gr.Image(
1454
+ label="Upload Goal Image",
1455
+ type="filepath",
1456
+ interactive=True,
1457
+ elem_id="goal_input",
1458
+ )
1459
+ planning_examples = gr.Examples(
1460
+ examples=[
1461
+ ["assets/example_obs_goal/01_obs.png", "assets/example_obs_goal/01_goal.png"],
1462
+ ["assets/example_obs_goal/02_obs.png", "assets/example_obs_goal/02_goal.png"],
1463
+ ["assets/example_obs_goal/03_obs.png", "assets/example_obs_goal/03_goal.png"],
1464
+ ["assets/example_obs_goal/04_obs.png", "assets/example_obs_goal/04_goal.png"],
1465
+ ],
1466
+ inputs=[image_input_planning, goal_input],
1467
+ label="Planning Examples",
1468
+ examples_per_page=4,
1469
  )
1470
 
1471
+ with gr.Row(visible=False) as preview_row:
1472
+ image_preview = gr.Image(
1473
+ label="Start Image Preview",
1474
+ elem_id="image_preview",
1475
+ visible=False,
1476
+ )
1477
+ goal_preview = gr.Image(
1478
+ label="Goal Image Preview",
1479
+ elem_id="goal_preview",
1480
+ visible=False,
1481
+ )
1482
 
1483
+ with gr.Group(elem_classes=["params-section", "compact-params"]):
1484
+ gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
1486
  with gr.Row():
1487
  with gr.Column(scale=1):
1488
+ height = gr.Dropdown(
1489
+ choices=[480],
1490
+ value=480,
1491
+ label="Height",
1492
+ info="Height of the output video",
 
 
1493
  )
1494
 
 
1495
  with gr.Column(scale=1):
1496
+ width = gr.Dropdown(
1497
+ choices=[720],
1498
+ value=720,
1499
+ label="Width",
1500
+ info="Width of the output video",
 
 
1501
  )
1502
 
1503
  with gr.Row():
1504
  with gr.Column(scale=1):
1505
+ num_frames = gr.Dropdown(
1506
+ choices=[17, 25, 33, 41],
1507
+ value=41,
1508
+ label="Number of Frames",
1509
+ info="Number of frames to predict",
1510
  )
1511
 
1512
  with gr.Column(scale=1):
1513
+ fps = gr.Dropdown(
1514
+ choices=[8, 10, 12, 15, 24],
1515
+ value=12,
1516
+ label="FPS",
1517
+ info="Frames per second",
1518
  )
1519
 
1520
  with gr.Row():
1521
+ num_inference_steps = gr.Slider(
1522
+ minimum=1,
1523
+ maximum=60,
1524
+ value=4,
1525
+ step=1,
1526
+ label="Inference Steps",
1527
+ info="Number of inference step",
1528
+ )
 
1529
 
1530
+ sliding_window_stride = gr.Slider(
1531
  minimum=1,
1532
+ maximum=40,
1533
+ value=24,
1534
  step=1,
1535
+ label="Sliding Window Stride",
1536
+ info="Sliding window stride (window size equals num_frames). Only used for the 'reconstruction' task",
1537
+ visible=True,
1538
  )
1539
 
1540
+ use_dynamic_cfg = gr.Checkbox(
1541
+ label="Use Dynamic CFG",
1542
+ value=True,
1543
+ info="Use dynamic CFG",
1544
  visible=False,
 
1545
  )
1546
 
1547
+ raymap_option = gr.Radio(
1548
+ choices=["backward", "forward_right", "left_forward", "right"],
1549
+ label="Camera Movement Direction",
1550
+ value="forward_right",
1551
+ info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
1552
+ visible=False,
1553
+ )
1554
 
1555
+ post_reconstruction = gr.Checkbox(
1556
+ label="Post-Reconstruction",
1557
+ value=True,
1558
+ info="Run reconstruction after prediction for better quality",
1559
+ visible=False,
1560
+ )
1561
 
1562
+ with gr.Accordion(
1563
+ "Advanced Options", open=False, visible=True, elem_classes=["advanced-options-header"]
1564
+ ) as advanced_options:
1565
+ with gr.Group(elem_classes=["advanced-section"]):
1566
+ with gr.Row():
1567
+ guidance_scale = gr.Slider(
1568
+ minimum=1.0,
1569
+ maximum=10.0,
1570
+ value=1.0,
1571
+ step=0.1,
1572
+ label="Guidance Scale",
1573
+ info="Guidance scale (only for prediction / planning)",
1574
+ )
1575
+
1576
+ with gr.Row():
1577
+ seed = gr.Number(
1578
+ value=42,
1579
+ label="Random Seed",
1580
+ info="Set a seed for reproducible results",
1581
+ precision=0,
1582
+ minimum=0,
1583
+ maximum=2147483647,
1584
+ )
1585
+
1586
+ with gr.Row():
1587
+ with gr.Column(scale=1):
1588
+ smooth_camera = gr.Checkbox(
1589
+ label="Smooth Camera",
1590
+ value=True,
1591
+ info="Apply smoothing to camera trajectory",
1592
+ )
1593
+
1594
+ with gr.Column(scale=1):
1595
+ align_pointmaps = gr.Checkbox(
1596
+ label="Align Point Maps",
1597
+ value=False,
1598
+ info="Align point maps across frames",
1599
+ )
1600
+
1601
+ with gr.Row():
1602
+ with gr.Column(scale=1):
1603
+ max_depth = gr.Slider(
1604
+ minimum=10,
1605
+ maximum=200,
1606
+ value=60,
1607
+ step=10,
1608
+ label="Max Depth",
1609
+ info="Maximum depth for point cloud (higher = more distant points)",
1610
+ )
1611
+
1612
+ with gr.Column(scale=1):
1613
+ rtol = gr.Slider(
1614
+ minimum=0.01,
1615
+ maximum=2.0,
1616
+ value=0.2,
1617
+ step=0.01,
1618
+ label="Relative Tolerance",
1619
+ info="Used for depth edge detection. Lower = remove more edges",
1620
+ )
1621
+
1622
+ pointcloud_save_frame_interval = gr.Slider(
1623
+ minimum=1,
1624
+ maximum=20,
1625
+ value=10,
1626
+ step=1,
1627
+ label="Point Cloud Frame Interval",
1628
+ info="Save point cloud every N frames (higher = fewer files but less complete representation)",
1629
+ )
1630
 
1631
+ with gr.Group(elem_classes=["run-button-container"]):
1632
+ run_button = gr.Button("Run Aether", variant="primary", elem_classes=["run-button"])
1633
+
1634
+ with gr.Column(elem_classes=["output-column"]):
1635
+ with gr.Group(elem_classes=["output-panel"]):
1636
+ gr.Markdown("## 📤 Output", elem_classes=["task-header"])
1637
+
1638
+ with gr.Group(elem_classes=["output-section"]):
1639
+ gr.Markdown("### RGB Video", elem_classes=["output-section-title"])
1640
+ rgb_output = gr.Video(
1641
+ label="RGB Output", interactive=False, elem_id="rgb_output"
1642
+ )
1643
+
1644
+ with gr.Group(elem_classes=["output-section"]):
1645
+ gr.Markdown("### Depth Video", elem_classes=["output-section-title"])
1646
+ depth_output = gr.Video(
1647
+ label="Depth Output", interactive=False, elem_id="depth_output"
1648
+ )
1649
+
1650
+ with gr.Group(elem_classes=["output-section"]):
1651
+ gr.Markdown("### Point Clouds", elem_classes=["output-section-title"])
1652
+ with gr.Row(elem_classes=["pointcloud-controls"]):
1653
+ pointcloud_frames = gr.Dropdown(
1654
+ label="Select Frame",
1655
+ choices=[],
1656
+ value=None,
1657
+ interactive=True,
1658
+ elem_id="pointcloud_frames",
1659
+ )
1660
+ pointcloud_download = gr.DownloadButton(
1661
+ label="Download Point Cloud",
1662
+ visible=False,
1663
+ elem_id="pointcloud_download",
1664
+ )
1665
+
1666
+ model_output = gr.Model3D(
1667
+ label="Point Cloud Viewer", interactive=True, elem_id="model_output"
1668
+ )
1669
+
1670
+ gr.Markdown(
1671
+ """
1672
+ > **Note:** 3D point clouds take a long time to visualize, and we show the keyframes only.
1673
+ > You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.
1674
+ """
1675
+ )
1676
+
1677
+ with gr.Group(elem_classes=["output-section"]):
1678
+ gr.Markdown("### About Results", elem_classes=["output-section-title"])
1679
+ gr.Markdown(
1680
+ """
1681
+ #### Understanding the Outputs
1682
+
1683
+ - **RGB Video**: Shows the predicted or reconstructed RGB frames
1684
+ - **Depth Video**: Visualizes the disparity maps in color (closer = red, further = blue)
1685
+ - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
1686
+ """
1687
+ )
1688
 
1689
  # Event handlers
1690
  task.change(
1691
  fn=update_task_ui,
1692
  inputs=[task],
1693
  outputs=[
1694
+ reconstruction_group,
1695
+ prediction_group,
1696
+ planning_group,
1697
+ preview_row,
 
1698
  num_inference_steps,
1699
  sliding_window_stride,
1700
  use_dynamic_cfg,
 
1705
  )
1706
 
1707
  image_input.change(
1708
+ fn=update_image_preview,
1709
+ inputs=[image_input],
1710
+ outputs=[image_preview]
1711
  ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])
1712
 
1713
  goal_input.change(
1714
+ fn=update_goal_preview,
1715
+ inputs=[goal_input],
1716
+ outputs=[goal_preview]
1717
  ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])
1718
 
1719
  def update_pointcloud_frames(pointcloud_paths):
 
1935
  outputs=[pointcloud_download],
1936
  )
1937
 
 
 
 
 
 
 
 
 
 
1938
  # Load the model at startup
1939
+ demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
1940
 
1941
  if __name__ == "__main__":
1942
  os.environ["TOKENIZERS_PARALLELISM"] = "false"