# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
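"""Submit a multi-node SLURM training job for fast3r.

Builds an sbatch script from the command-line arguments and pipes it to
`sbatch`, which then launches torchrun on every allocated node.

Usage:
    python submit.py --nodes=<NODES> --experiment=<EXPERIMENT>
"""
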
import subprocess
import sys


def usage():
    print("Usage: python submit.py --nodes=<NODES> --experiment=<EXPERIMENT>")
    print("  --nodes        Number of nodes to allocate for the job.")
    print("  --experiment   Name of the experiment to be used as the job name.")
    sys.exit(1)


def main():
    # Validate number of arguments
    if len(sys.argv) != 3:
        print("Error: Incorrect number of parameters.")
        usage()

    # Parse parameters
    nodes = None
    experiment = None
    for arg in sys.argv[1:]:
        if arg.startswith('--nodes='):
            nodes = arg.split('=', 1)[1]
        elif arg.startswith('--experiment='):
            experiment = arg.split('=', 1)[1]
        else:
            print(f"Error: Invalid parameter '{arg}'")
            usage()

    # Ensure both parameters are provided
    if not nodes or not experiment:
        print("Error: Missing required parameters.")
        usage()

    # SLURM script as a multi-line string
    slurm_script = f"""#!/bin/bash
#SBATCH --job-name={experiment}
#SBATCH --time=7-00:00:00 # 7 days
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=BEGIN,END
#SBATCH --account=3dfy
#SBATCH --nodes={nodes}
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --gpus-per-node=8
#SBATCH --mem=0  # Request all available memory on each node
#SBATCH --signal=SIGUSR1@120 # Send SIGUSR1 120 seconds before job end to allow for checkpointing by Lightning
#SBATCH --output=/path/to/slurm_out/%x-%j.out
echo "Begin setting up env on head node ($HOSTNAME)..."
echo $(env | grep SLURM)
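# Rendezvous settings for torchrun: the first node in the allocation hosts the c10d rendezvous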
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9929
export RDZV_ID=$SLURM_JOBID
export OMP_NUM_THREADS=$(($SLURM_CPUS_PER_TASK / $SLURM_GPUS_PER_NODE)) # Critical to ensure dataloaders use all CPUs for torchrun!
. /path/to/miniforge3/etc/profile.d/conda.sh
conda activate dust3r
cd /path/to/fast3r
# Debugging flags (optional)
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
export PYTHONFAULTHANDLER=1
export TORCH_DISTRIBUTED_DEBUG=INFO
echo "env setup on head node ($HOSTNAME) finished, starting srun..."
# --cpu-bind=none is critical to ensure that the dataloaders can use all CPUs
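# srun starts one task per node; each task runs torchrun, which spawns one process per GPU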
srun --cpu-bind=none --jobid $SLURM_JOBID /bin/bash -c ' \
echo MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT, SLURM_PROCID: $SLURM_PROCID && \
echo local hostname: $(hostname) && \
torchrun \
--nnodes=$SLURM_NNODES --nproc_per_node=$SLURM_GPUS_PER_NODE --rdzv-id=$RDZV_ID --rdzv-backend=c10d --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT \
fast3r/train.py paths.run_folder_name={experiment}_$SLURM_JOBID trainer.num_nodes={nodes} logger.wandb.name={experiment}_$SLURM_JOBID experiment={experiment} \
'
echo "srun finished. Job completed on $(date)"
"""
    # Submit the SLURM job by piping the generated script to sbatch via stdin
    process = subprocess.Popen(['sbatch'], stdin=subprocess.PIPE, text=True)
    process.communicate(slurm_script)


if __name__ == "__main__":
    main()