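"""Submit a multi-node Fast3R training job to SLURM.

Builds an sbatch script from the command-line arguments and pipes it to
sbatch on stdin. Example invocation (placeholder values):

    python submit.py --nodes=4 --experiment=my_experiment
"""
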
import subprocess
import sys


def usage():
    print("Usage: python submit.py --nodes=<NODES> --experiment=<EXPERIMENT>")
    print("  --nodes       Number of nodes to allocate for the job.")
    print("  --experiment  Name of the experiment to be used as the job name.")
    sys.exit(1)


def main():
    if len(sys.argv) != 3:
        print("Error: Incorrect number of parameters.")
        usage()
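
    # Minimal hand-rolled flag parsing: the script accepts exactly
    # --nodes=<N> and --experiment=<NAME>, in either order.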
    nodes = None
    experiment = None
    for arg in sys.argv[1:]:
        if arg.startswith('--nodes='):
            nodes = arg.split('=', 1)[1]
        elif arg.startswith('--experiment='):
            experiment = arg.split('=', 1)[1]
        else:
            print(f"Error: Invalid parameter '{arg}'")
            usage()

    if not nodes or not experiment:
        print("Error: Missing required parameters.")
        usage()
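
    # The batch script is built as an in-memory string and later piped
    # straight to sbatch, so no temporary file is needed.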
    slurm_script = f"""#!/bin/bash
#SBATCH --job-name={experiment}
#SBATCH --time=7-00:00:00  # 7 days
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,END
#SBATCH --account=3dfy
#SBATCH --nodes={nodes}
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --gpus-per-node=8
#SBATCH --mem=0
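# --mem=0 requests all of the memory on each allocated node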
#SBATCH --signal=SIGUSR1@120  # Send SIGUSR1 120 seconds before job end to allow for checkpointing by Lightning
#SBATCH --output=/path/to/slurm_out/%x-%j.out

echo "Begin setting up env on head node ($HOSTNAME)..."

echo $(env | grep SLURM)
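# The first node in the allocation serves as the rendezvous endpoint for torchrun's c10d backend.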
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9929
export RDZV_ID=$SLURM_JOBID

export OMP_NUM_THREADS=$(($SLURM_CPUS_PER_TASK / $SLURM_GPUS_PER_NODE))  # Critical to ensure dataloaders use all CPUs for torchrun!

. /path/to/miniforge3/etc/profile.d/conda.sh
conda activate dust3r

cd /path/to/fast3r

# Debugging flags (optional)
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
export PYTHONFAULTHANDLER=1
export TORCH_DISTRIBUTED_DEBUG=INFO

echo "env setup on head node ($HOSTNAME) finished, starting srun..."

# --cpu-bind=none is critical to ensure that the dataloaders can use all CPUs
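# srun starts one task per node (--ntasks-per-node=1); each task runs torchrun, which spawns one worker process per GPU.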
srun --cpu-bind=none --jobid $SLURM_JOBID /bin/bash -c ' \
echo MASTER_ADDR: $MASTER_ADDR, MASTER_PORT: $MASTER_PORT, SLURM_PROCID: $SLURM_PROCID && \
echo local hostname: $(hostname) && \
torchrun \
--nnodes=$SLURM_NNODES --nproc_per_node=$SLURM_GPUS_PER_NODE --rdzv-id=$RDZV_ID --rdzv-backend=c10d --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT \
fast3r/train.py paths.run_folder_name={experiment}_$SLURM_JOBID trainer.num_nodes={nodes} logger.wandb.name={experiment}_$SLURM_JOBID experiment={experiment} \
'

echo "srun finished. Job completed on $(date)"
"""
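
    # Pipe the generated script to sbatch on stdin; sbatch reads the batch
    # script from standard input when no script file is given.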
    process = subprocess.Popen(['sbatch'], stdin=subprocess.PIPE, text=True)
    process.communicate(slurm_script)
    sys.exit(process.returncode)  # propagate sbatch's exit status


if __name__ == "__main__":
    main()