# Copyright (c) Facebook, Inc. and its affiliates.
import importlib
import importlib.util
import logging
import numpy as np
import os
import random
import sys
from datetime import datetime
import torch
import socket
import subprocess
import time
from . import comm
__all__ = ["seed_all_rng"]
TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2])
"""
PyTorch version as a tuple of 2 ints. Useful for comparison.
"""
def seed_all_rng(seed=None):
"""
Set the random seed for the RNG in torch, numpy and python.
Args:
seed (int): if None, will use a strong random seed.
"""
if seed is None:
seed = (
os.getpid()
+ int(datetime.now().strftime("%S%f"))
+ int.from_bytes(os.urandom(2), "big")
)
logger = logging.getLogger(__name__)
logger.info("Using a generated random seed {}".format(seed))
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
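# A minimal usage sketch (hypothetical call sites, not part of this module):
# a fixed integer makes torch / numpy / random reproducible, while passing
# None (the default) draws a strong seed from the pid, wall clock and urandom.
#
#   seed_all_rng(42)   # deterministic runs, e.g. for debugging
#   seed_all_rng()     # fresh random seed, e.g. one per worker process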
# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
def _import_file(module_name, file_path, make_importable=False):
spec = importlib.util.spec_from_file_location(module_name, file_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
if make_importable:
sys.modules[module_name] = module
return module
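# Illustrative use (hypothetical path and module name): load a plugin file and,
# with make_importable=True, register it so a later `import my_plugin` resolves
# to the same module object.
#
#   plugin = _import_file("my_plugin", "/path/to/my_plugin.py", make_importable=True)
#   plugin.setup()  # call whatever the plugin actually defines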
def _configure_libraries():
"""
Configurations for some libraries.
"""
# An environment option to disable `import cv2` globally,
# in case it leads to negative performance impact
disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False))
if disable_cv2:
sys.modules["cv2"] = None
else:
# Disable opencl in opencv since its interaction with cuda often has negative effects
# This envvar is supported after OpenCV 3.4.0
os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
try:
import cv2
if int(cv2.__version__.split(".")[0]) >= 3:
cv2.ocl.setUseOpenCL(False)
        except ModuleNotFoundError:
            # Other kinds of ImportError should not be ignored, because a failed
            # opencv import could mess up the address space:
            # https://github.com/skvark/opencv-python/issues/381
            pass
def get_version(module, digit=2):
return tuple(map(int, module.__version__.split(".")[:digit]))
# fmt: off
assert get_version(torch) >= (1, 4), "Requires torch>=1.4"
import fvcore
assert get_version(fvcore, 3) >= (0, 1, 2), "Requires fvcore>=0.1.2"
import yaml
assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1"
# fmt: on
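# Sketch of how the cv2 switch above is driven purely by the environment
# (shell invocations shown for illustration only):
#
#   DETECTRON2_DISABLE_CV2=1 python train.py   # blocks `import cv2` entirely
#   python train.py                            # imports cv2 with OpenCL disabled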
_ENV_SETUP_DONE = False
def setup_environment():
"""Perform environment setup work. The default setup is a no-op, but this
function allows the user to specify a Python source file or a module in
the $DETECTRON2_ENV_MODULE environment variable, that performs
custom setup work that may be necessary to their computing environment.
"""
global _ENV_SETUP_DONE
if _ENV_SETUP_DONE:
return
_ENV_SETUP_DONE = True
_configure_libraries()
custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE")
if custom_module_path:
setup_custom_environment(custom_module_path)
else:
# The default setup is a no-op
pass
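# Example of pointing setup_environment() at a custom module via the variable
# it checks above (module / file names here are placeholders):
#
#   DETECTRON2_ENV_MODULE=my_pkg.env_setup python train.py
#   DETECTRON2_ENV_MODULE=/path/to/env_setup.py python train.py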
def setup_custom_environment(custom_module):
"""
Load custom environment setup by importing a Python source file or a
module, and run the setup function.
"""
if custom_module.endswith(".py"):
module = _import_file("detectron2.utils.env.custom_module", custom_module)
else:
module = importlib.import_module(custom_module)
assert hasattr(module, "setup_environment") and callable(module.setup_environment), (
"Custom environment module defined in {} does not have the "
"required callable attribute 'setup_environment'."
).format(custom_module)
module.setup_environment()
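# A minimal custom environment module only needs a callable named
# `setup_environment`, e.g. (contents of the placeholder env_setup.py above):
#
#   def setup_environment():
#       import os
#       os.environ.setdefault("OMP_NUM_THREADS", "1")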
def check_dist_portfile():
    """On SLURM, let rank 0 remove a stale port file left over from a previous run."""
    if "SLURM_JOBID" in os.environ and int(os.environ["SLURM_PROCID"]) == 0:  # rank == 0
        hostfile = "dist_url_" + os.environ["SLURM_JOBID"] + ".txt"
        if os.path.exists(hostfile):
            os.remove(hostfile)
def find_free_port():
    s = socket.socket()
    s.bind(('', 0))  # Bind to a free port provided by the host.
    port = s.getsockname()[1]
    s.close()  # Release the socket so the port can later be reused as MASTER_PORT.
    return port
def init_distributed_mode(args):
    """Initialize torch.distributed either from torchrun-style environment
    variables or from SLURM variables, filling in args.rank, args.world_size,
    args.gpu, args.dist_url and args.distributed."""
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        # Launched with torchrun / torch.distributed.launch: the rendezvous
        # information is already in the environment.
        if int(os.environ["RANK"]) == 0:
            print('Init dist from environment variables (not running under SLURM).')
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ['WORLD_SIZE'])
        args.gpu = int(os.environ['LOCAL_RANK'])
        args.dist_url = 'env://'
        os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count())
        addr = socket.gethostname()
    elif 'SLURM_PROCID' in os.environ:
        # Launched as a SLURM job: derive the rendezvous information from
        # SLURM environment variables.
        proc_id = int(os.environ['SLURM_PROCID'])
        if proc_id == 0:
            print('Init dist using SLURM!')
            print("Job id is {} on {}".format(os.environ["SLURM_JOBID"], os.environ['SLURM_NODELIST']))
ntasks = int(os.environ['SLURM_NTASKS'])
node_list = os.environ['SLURM_NODELIST']
num_gpus = torch.cuda.device_count()
addr = subprocess.getoutput(
'scontrol show hostname {} | head -n1'.format(node_list))
jobid = os.environ["SLURM_JOBID"]
hostfile = "dist_url_" + jobid + ".txt"
        if proc_id == 0:
            # Rank 0 picks a free port and publishes it through a shared file.
            args.tcp_port = str(find_free_port())
            print('write port {} to file: {}'.format(args.tcp_port, hostfile))
            with open(hostfile, "w") as f:
                f.write(args.tcp_port)
        else:
            # Other ranks wait for the file to appear, then read the port from it.
            print('read port from file: {}'.format(hostfile))
            while not os.path.exists(hostfile):
                time.sleep(1)
            time.sleep(2)  # give rank 0 time to finish writing
            with open(hostfile, "r") as f:
                args.tcp_port = f.read()
        os.environ['MASTER_PORT'] = str(args.tcp_port)
os.environ['MASTER_ADDR'] = addr
os.environ['WORLD_SIZE'] = str(ntasks)
os.environ['RANK'] = str(proc_id)
os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
os.environ['LOCAL_SIZE'] = str(num_gpus)
args.dist_url = 'env://'
args.world_size = ntasks
args.rank = proc_id
args.gpu = proc_id % num_gpus
else:
print('Not using distributed mode')
args.distributed = False
return
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('rank: {} addr: {} port: {}'.format(args.rank, addr, os.environ['MASTER_PORT']))
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
    if 'SLURM_PROCID' in os.environ and args.rank == 0:
        # All ranks have passed the barrier above, so the port file can be removed.
        if os.path.isfile(hostfile):
            os.remove(hostfile)
if args.world_size >= 1:
# Setup the local process group (which contains ranks within the same machine)
assert comm._LOCAL_PROCESS_GROUP is None
num_gpus = torch.cuda.device_count()
num_machines = args.world_size // num_gpus
for i in range(num_machines):
ranks_on_i = list(range(i * num_gpus, (i + 1) * num_gpus))
print('new_group: {}'.format(ranks_on_i))
pg = torch.distributed.new_group(ranks_on_i)
if args.rank in ranks_on_i:
# if i == os.environ['SLURM_NODEID']:
comm._LOCAL_PROCESS_GROUP = pg
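# Rough usage sketch from a launcher script (the attribute names match what
# init_distributed_mode() reads and writes on `args`; the script itself is an
# assumption, not part of this module):
#
#   import argparse
#   args = argparse.ArgumentParser().parse_args()
#   check_dist_portfile()         # rank 0 clears a stale SLURM port file
#   init_distributed_mode(args)   # fills args.rank / world_size / gpu / distributed
#   if args.distributed:
#       print("running rank {} of {}".format(args.rank, args.world_size))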