saburq committed on
Commit 30ffd75 · 1 Parent(s): f692b41

feat: Add CPU support and improve device handling


This commit adds flexible device support to the AIFS weather forecast app,
allowing it to run in both GPU and CPU environments. The changes improve the
robustness and accessibility of the application.

Key changes:
- Add automatic device detection (CUDA/CPU)
- Make model initialization device-aware
- Update requirements.txt for both CPU and GPU installations
- Add memory optimization settings for CPU usage
- Improve logging for device selection and model initialization

Technical details:
- Introduce get_device() function for runtime hardware detection
- Make device parameter optional in run_forecast()
- Update model initialization to use detected device
- Add documentation for CPU-specific configurations

This change ensures the app can run in environments without CUDA support,
albeit at reduced performance. Memory optimization parameters are included
to help manage resource usage on CPU-only systems.
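
The memory optimization settings referred to above do not appear in the hunks below, so the following is only a rough sketch of what such CPU-oriented settings could look like, assuming they rely on standard PyTorch controls; the actual parameters used by the commit are not visible here:

# Hypothetical sketch only -- the commit's real memory settings are not shown
# in the diff. These are standard PyTorch/OS knobs for CPU-only inference.
import os
import torch

def apply_cpu_memory_settings() -> None:
    """Cap thread fan-out and drop autograd bookkeeping for CPU inference."""
    threads = os.cpu_count() or 1
    os.environ.setdefault("OMP_NUM_THREADS", str(threads))  # limit OpenMP threads
    torch.set_num_threads(threads)   # cap intra-op parallelism
    torch.set_grad_enabled(False)    # inference only: no gradient buffers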

Files changed (2)
  1. app.py +24 -6
  2. requirements.txt +6 -1
app.py CHANGED
@@ -78,8 +78,23 @@ for var in ["t", "u", "v", "w", "q", "z"]:
         var_id = f"{var}_{level}"
         VARIABLE_GROUPS["Pressure Level Variables"][var_id] = f"{var_name} at {level}hPa"
 
-# Load the model once at startup
-MODEL = SimpleRunner("aifs-single-mse-1.0.ckpt", device="cuda")  # Default to CUDA
+def get_device():
+    """Determine the best available device"""
+    try:
+        import torch
+        if torch.cuda.is_available():
+            logger.info("CUDA is available, using GPU")
+            return "cuda"
+        else:
+            logger.info("CUDA is not available, using CPU")
+            return "cpu"
+    except ImportError:
+        logger.info("PyTorch not found, using CPU")
+        return "cpu"
+
+# Update the model initialization to use the detected device
+DEVICE = get_device()
+MODEL = SimpleRunner("aifs-single-mse-1.0.ckpt", device=DEVICE)
 
 # Create and set custom temp directory
 TEMP_DIR = Path("./gradio_temp")
@@ -239,10 +254,13 @@ def plot_forecast(state, selected_variable):
 
     return temp_file
 
-def run_forecast(date: datetime.datetime, lead_time: int, device: str) -> Dict[str, Any]:
+def run_forecast(date: datetime.datetime, lead_time: int, device: str = None) -> Dict[str, Any]:
+    # Use the global device if none specified
+    device = device or DEVICE
+
     # Get all required fields
     fields = {}
-    logger.info(f"Starting forecast for lead_time: {lead_time} hours")
+    logger.info(f"Starting forecast for lead_time: {lead_time} hours on {device}")
 
     # Get surface fields
     logger.info("Getting surface fields...")
@@ -469,8 +487,8 @@ def update_interface():
 
     def run_and_store(lead_time):
        """Run forecast and store state"""
-        forecast_state = run_forecast(DEFAULT_DATE, lead_time, "cuda")
-        plot = plot_forecast(forecast_state, "2t")  # Default to 2t
+        forecast_state = run_forecast(DEFAULT_DATE, lead_time, DEVICE)  # Use global DEVICE
+        plot = plot_forecast(forecast_state, "2t")
         return forecast_state, plot
 
     def update_plot_from_state(forecast_state, variable):
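
Because the device parameter is now optional, callers can omit it and let run_forecast() fall back to the module-level DEVICE. A minimal usage sketch (the 72-hour lead time is an illustrative value, not taken from the commit):

# device omitted: run_forecast() falls back to the DEVICE chosen by get_device()
state = run_forecast(DEFAULT_DATE, 72)   # "cuda" if available, otherwise "cpu"
plot = plot_forecast(state, "2t")        # 2 m temperature, as in run_and_store()
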
requirements.txt CHANGED
@@ -1,4 +1,9 @@
-# torch # uncomment on cuda
+# For CPU-only installation, use:
+torch
+# For CUDA installation, use:
+# --extra-index-url https://download.pytorch.org/whl/cu118
+# torch==2.0.1+cu118
+
 flash-attn
 anemoi-inference[huggingface]==0.4.9
 anemoi-models==0.3.1
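
Note that pip honours --extra-index-url lines inside a requirements file, so switching to the CUDA build only requires commenting out the bare torch entry and uncommenting the two CUDA lines before rerunning pip install -r requirements.txt.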