File size: 2,439 Bytes
57746f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/bin/bash
# Configuration for one few-shot training submission. Everything below is a
# plain global that the generator functions further down read.

# Timestamp taken when the launcher starts.
DATE=$(date -d now)

# Experiment name; the job name and both output roots are derived from it.
EXP='hsnet'
JOB="fewshot_${EXP}"
SAVE_ROOT="save/${EXP}"
SCRIPT_ROOT="sweep_scripts/${EXP}"

# Generated job scripts and sbatch files are written under this directory.
mkdir -p "${SCRIPT_ROOT}"

# Scheduler request: partition, QOS, GPU count, and ten CPUs per GPU.
partition='g24'
qos='normal'  # high normal low
NGPU=4
NCPU=$(( 10 * NGPU ))

# Append one line to the job script currently being generated.
# Globals:   SCRIPT (read) - path of the job script
# Arguments: the words of the line; they are joined with single spaces
# Outputs:   one line appended to $SCRIPT
function print_append {
    # "$*" plus printf writes the text verbatim: no glob expansion of
    # characters such as '*', no collapsing of inner whitespace, and no
    # echo option parsing when the line happens to start with -n or -e.
    printf '%s\n' "$*" >> "$SCRIPT"
}

# Append one line to the sbatch submission file currently being generated.
# Globals:   SLURM (read) - path of the submission file
# Arguments: the words of the line; they are joined with single spaces
# Outputs:   one line appended to $SLURM
function slurm_append {
    # "$*" plus printf writes the text verbatim: no glob expansion, no
    # whitespace collapsing, and no echo option parsing of a leading -n/-e.
    printf '%s\n' "$*" >> "$SLURM"
}

# Start the two generated files for the current job: the sbatch submission
# file and the job script that the submission file launches.
#
# Globals read:    SAVE_ROOT, SCRIPT_ROOT, JOB, partition, NGPU, NCPU, qos
# Globals written: SAVE, SCRIPT, SLURM
# Side effects:    creates $SAVE and $SCRIPT_ROOT, (re)creates $SLURM and
#                  $SCRIPT, appends a line to ./submitted.txt
# Outputs:         prints $JOB on stdout
#
# The job script is left unfinished on purpose: its last structural line is
# an opening "{", which print_after closes once the caller has appended the
# commands to run.
function print_setup {
    SAVE="${SAVE_ROOT}/${JOB}"
    SCRIPT="${SCRIPT_ROOT}/${JOB}.sh"
    SLURM="${SCRIPT_ROOT}/${JOB}.slrm"
    mkdir -p "$SAVE" "$SCRIPT_ROOT"

    # Running log of every submission made from this directory.
    echo "$(date -d now) $SAVE" >> 'submitted.txt'

    # ---- sbatch submission file ----
    printf '%s\n' '#!/bin/bash' > "$SLURM"
    slurm_append "#SBATCH --job-name=job1111_${JOB}"
    slurm_append "#SBATCH --output=${SAVE}/stdout.txt"
    slurm_append "#SBATCH --error=${SAVE}/stderr.txt"
    # Append rather than truncate, so a requeued run keeps earlier logs.
    slurm_append "#SBATCH --open-mode=append"
    # Ask Slurm to send USR1 to the batch shell 120 s before the time limit.
    slurm_append "#SBATCH --signal=B:USR1@120"

    slurm_append "#SBATCH -p ${partition}"
    slurm_append "#SBATCH --gres=gpu:${NGPU}"
    slurm_append "#SBATCH -c ${NCPU}"
    slurm_append "#SBATCH -t 02-00"
    # slurm_append "#SBATCH -t 01-00"
    # slurm_append "#SBATCH -t 00-06"
    slurm_append "#SBATCH --qos=${qos}"
    # NOTE(review): the job script has a bash shebang but is started with
    # sh; where sh is not bash, "source" below may be unavailable - confirm.
    slurm_append "srun sh ${SCRIPT}"

    # ---- job script: signal handling ----
    # Lines containing $ or " are single-quoted so they reach the generated
    # file literally. The TERM test used to be written with nested double
    # quotes, which expanded $1 here and emitted `if [ = TERM ]`; that test
    # always errored, so SIGTERM fell through to the requeue branch.
    printf '%s\n' '#!/bin/bash' > "$SCRIPT"
    print_append 'trap_handler () {'
    print_append 'echo "Caught signal: " $1'
    print_append '# SIGTERM must be bypassed'
    print_append 'if [ "$1" = "TERM" ]; then'
    print_append 'echo "bypass sigterm"'
    print_append 'else'
    print_append '# Submit a new job to the queue'
    print_append 'echo "Requeuing " $SLURM_JOB_ID'
    print_append 'scontrol requeue $SLURM_JOB_ID'
    print_append 'fi'
    print_append '}'
    print_append "trap 'trap_handler USR1' USR1"
    print_append "trap 'trap_handler TERM' TERM"

    # ---- job script: environment, inside a group closed by print_after ----
    print_append '{'
    print_append 'source activate pytorch'
    print_append 'conda activate pytorch'
    # NOTE(review): PATH is pinned to one user's conda install; anyone else
    # running this generator must adjust it.
    print_append 'export PATH=/home/boyili/programfiles/anaconda3/envs/pytorch/bin:/home/boyili/programfiles/anaconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin'
    print_append 'which python'
    print_append 'echo $PATH'
    print_append 'export NCCL_DEBUG=INFO'
    print_append 'export PYTHONFAULTHANDLER=1'

    echo "$JOB"
}

# Finish the job script: close the command group opened during setup, run it
# in the background, and wait on it; then do the same for a 610-second sleep.
# NOTE(review): the trailing sleep presumably keeps the job alive long enough
# for a requeue request to take effect - confirm.
# Globals: none read directly; every line is emitted through print_append.
print_after() {
    local tail_line
    for tail_line in '} & ' 'wait $!' 'sleep 610 &' 'wait $!'; do
        print_append "$tail_line"
    done
}

# Generate both files for the configured job, then hand the submission file
# to the scheduler.
print_setup

# The training command, written into the job script as a single line.
train_cmd=(stdbuf -o0 -e0 python train.py --log 'log_pascal')
print_append "${train_cmd[@]}"

print_after
sbatch "$SLURM"