# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
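
"""Submit a multi-node SLURM training job.

Parses --nodes and --experiment, fills them into an sbatch script template, and
pipes the result to `sbatch` on stdin. Example invocation (values are illustrative):

    python submit.py --nodes=4 --experiment=my_experiment
"""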

import subprocess
import sys

def usage():
    print("Usage: python submit.py --nodes=<NODES> --experiment=<EXPERIMENT>")
    print("  --nodes       Number of nodes to allocate for the job.")
    print("  --experiment  Name of the experiment to be used as the job name.")
    sys.exit(1)

def main():
    # Validate number of arguments
    if len(sys.argv) != 3:
        print("Error: Incorrect number of parameters.")
        usage()

    # Parse parameters
    nodes = None
    experiment = None
    for arg in sys.argv[1:]:
        if arg.startswith('--nodes='):
            nodes = arg.split('=', 1)[1]
        elif arg.startswith('--experiment='):
            experiment = arg.split('=', 1)[1]
        else:
            print(f"Error: Invalid parameter '{arg}'")
            usage()

    # Ensure both parameters are provided
    if not nodes or not experiment:
        print("Error: Missing required parameters.")
        usage()

    # SLURM script as a multi-line string
    slurm_script = f"""#!/bin/bash
#SBATCH --job-name={experiment}
#SBATCH --time=7-00:00:00  # 7 days
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,END
#SBATCH --account=cortex
#SBATCH --qos=cortex
#SBATCH --nodes={nodes}
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --gpus-per-node=8
#SBATCH --mem=0  # Request all of the memory available on each node
#SBATCH --exclusive  # Ensures node exclusivity
#SBATCH --signal=SIGUSR1@120  # Send SIGUSR1 120 seconds before job end to allow for checkpointing by Lightning
#SBATCH --output=/path/to/slurm_out/%x-%j.out

echo "Begin setting up env on head node ($HOSTNAME)..."

echo $(env | grep SLURM)
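# Use the first node in the allocation as the master/rendezvous address for distributed training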
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9929
export RDZV_ID=$SLURM_JOBID

export OMP_NUM_THREADS=$(($SLURM_CPUS_PER_TASK / $SLURM_GPUS_PER_NODE))  # Critical to ensure dataloaders use all CPUs for torchrun!

. /path/to/miniforge3/etc/profile.d/conda.sh
conda activate dust3r

cd /path/to/fast3r

# Debugging flags (optional)
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
export PYTHONFAULTHANDLER=1
export TORCH_DISTRIBUTED_DEBUG=INFO

echo "env setup on head node ($HOSTNAME) finished, starting srun..."

# Alternative launcher (commented out): torchrun with a c10d rendezvous
# srun --cpu-bind=none \
# torchrun \
#     --nnodes=$SLURM_NNODES --nproc_per_node=$SLURM_GPUS_PER_NODE --rdzv-id=$RDZV_ID --rdzv-backend=c10d --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT \
#     fast3r/train.py paths.run_folder_name={experiment}_$SLURM_JOBID trainer.num_nodes={nodes} logger.wandb.name={experiment}_$SLURM_JOBID experiment={experiment}

srun --cpu-bind=none \
    python fast3r/train.py paths.run_folder_name={experiment}_$SLURM_JOBID trainer.num_nodes={nodes} logger.wandb.name={experiment}_$SLURM_JOBID experiment={experiment}

echo "srun finished. Job completed on $(date)"
"""

    # Submit the SLURM job: sbatch reads the generated script from stdin
    result = subprocess.run(['sbatch'], input=slurm_script, text=True)
    if result.returncode != 0:
        print("Error: sbatch submission failed.")
        sys.exit(result.returncode)


if __name__ == "__main__":
    main()