Spaces:

TongkunGuan
/

Token-level_Text_Image_Foundation_Model

Sleeping

App Files Files Community

Token-level_Text_Image_Foundation_Model / internvl /dist_utils.py

TongkunGuan

Upload 94 files

841bef5 verified 3 months ago

raw

history blame

3.89 kB

	import os
	import socket
	import subprocess
	from datetime import timedelta

	import deepspeed
	import torch
	import torch.multiprocessing as mp
	from torch import distributed as dist

	# Process-group timeout of 60 minutes.
	# NOTE(review): in the visible code this constant is only referenced by a
	# commented-out ``dist.init_process_group`` call; confirm it is still needed.
	timeout = timedelta(minutes=60)


	def _find_free_port():
	    """Ask the OS for a currently unused TCP port and return its number.

	    Copied from
	    https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py

	    Returns:
	        int: A port that was free at the moment of the call.
	    """
	    # The context manager closes the socket even when ``bind`` or
	    # ``getsockname`` raises, so no file descriptor is leaked.
	    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
	        # Binding to port 0 will cause the OS to find an available port.
	        sock.bind(('', 0))
	        # NOTE: the port is released as soon as the socket closes, so
	        # there is still a chance it is taken by another process before
	        # the caller binds to it.
	        return sock.getsockname()[1]


	def _is_free_port(port):
	    """Report whether no local listener accepts TCP connections on ``port``.

	    A connection is attempted to every IPv4 address the local hostname
	    resolves to, plus ``localhost``. The port counts as free only if all
	    attempts fail.

	    Args:
	        port (int): TCP port number to probe.

	    Returns:
	        bool: ``True`` if every connection attempt failed, else ``False``.
	    """
	    try:
	        host_ips = socket.gethostbyname_ex(socket.gethostname())[-1]
	    except (socket.gaierror, socket.herror):
	        # The local hostname is not resolvable; probing ``localhost`` is
	        # the best check that is still available.
	        host_ips = []
	    for ip in [*host_ips, 'localhost']:
	        # Use a fresh socket per address: after a failed ``connect`` the
	        # state of a socket is unspecified, so it must not be reused.
	        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	            if s.connect_ex((ip, port)) == 0:
	                return False
	    return True


	def init_dist(launcher, backend='nccl', **kwargs):
	    """Set up distributed training for the given launcher.

	    If no multiprocessing start method has been chosen yet, it is set to
	    ``'spawn'`` before the launcher-specific initializer runs.

	    Args:
	        launcher (str): One of ``'pytorch'``, ``'mpi'`` or ``'slurm'``.
	        backend (str): Communication backend passed to the initializer.
	            Defaults to ``'nccl'``.
	        **kwargs: Extra keyword arguments forwarded to the initializer.

	    Raises:
	        ValueError: If ``launcher`` is not a supported launcher name.
	    """
	    start_method = mp.get_start_method(allow_none=True)
	    if start_method is None:
	        mp.set_start_method('spawn')

	    # Reject unknown launchers up front, then dispatch with early returns.
	    if launcher not in ('pytorch', 'mpi', 'slurm'):
	        raise ValueError(f'Invalid launcher type: {launcher}')
	    if launcher == 'pytorch':
	        _init_dist_pytorch(backend, **kwargs)
	        return
	    if launcher == 'mpi':
	        _init_dist_mpi(backend, **kwargs)
	        return
	    _init_dist_slurm(backend, **kwargs)


	def _init_dist_pytorch(backend, **kwargs):
	    """Initialize distributed training for the PyTorch launcher.

	    Selects the CUDA device for this process and then calls
	    ``deepspeed.init_distributed``.

	    The device index is taken from the ``LOCAL_RANK`` environment variable
	    when it is set and names a visible device. Otherwise it falls back to
	    ``RANK % torch.cuda.device_count()``, which is only correct when every
	    node runs exactly one process per visible GPU.

	    Args:
	        backend (str): Backend passed as ``dist_backend`` to DeepSpeed.
	        **kwargs: Accepted for interface compatibility; not forwarded.

	    Raises:
	        KeyError: If the ``RANK`` environment variable is not set.
	    """
	    rank = int(os.environ['RANK'])
	    num_gpus = torch.cuda.device_count()
	    local_rank = int(os.environ.get('LOCAL_RANK', -1))
	    if 0 <= local_rank < num_gpus:
	        device_index = local_rank
	    else:
	        # LOCAL_RANK is missing or does not name a visible device (for
	        # example when each process sees a single GPU): keep the
	        # historical behaviour.
	        device_index = rank % num_gpus
	    torch.cuda.set_device(device_index)
	    deepspeed.init_distributed(dist_backend=backend)


	def _init_dist_mpi(backend, **kwargs):
	    """Initialize ``torch.distributed`` for processes started by Open MPI.

	    Binds the process to the CUDA device given by
	    ``OMPI_COMM_WORLD_LOCAL_RANK``, copies the Open MPI world size and rank
	    into ``WORLD_SIZE`` and ``RANK``, and calls
	    ``dist.init_process_group``.

	    Args:
	        backend (str): Backend of torch.distributed.
	        **kwargs: Extra keyword arguments for ``dist.init_process_group``.

	    Raises:
	        KeyError: If ``MASTER_ADDR`` or a required ``OMPI_COMM_WORLD_*``
	            environment variable is not set.
	    """
	    env = os.environ
	    torch.cuda.set_device(int(env['OMPI_COMM_WORLD_LOCAL_RANK']))
	    # 29500 is torch.distributed default port
	    env.setdefault('MASTER_PORT', '29500')
	    if 'MASTER_ADDR' not in env:
	        raise KeyError('The environment variable MASTER_ADDR is not set')
	    # Mirror the Open MPI variables under the names torch.distributed reads.
	    for target, source in (('WORLD_SIZE', 'OMPI_COMM_WORLD_SIZE'),
	                           ('RANK', 'OMPI_COMM_WORLD_RANK')):
	        env[target] = env[source]
	    dist.init_process_group(backend=backend, **kwargs)


	def _init_dist_slurm(backend, port=None):
	    """Initialize slurm distributed training environment.

	    Reads ``SLURM_PROCID``, ``SLURM_NTASKS`` and ``SLURM_NODELIST``, binds
	    the process to CUDA device ``SLURM_PROCID % torch.cuda.device_count()``,
	    exports ``MASTER_PORT``, ``MASTER_ADDR``, ``WORLD_SIZE``, ``LOCAL_RANK``
	    and ``RANK``, and finally calls ``deepspeed.init_distributed``.

	    The master port is chosen in this order: the ``port`` argument, then
	    an existing ``MASTER_PORT`` environment variable, then ``29500`` if it
	    is free, and otherwise a free port picked by the OS.

	    Args:
	        backend (str): Backend of torch.distributed.
	        port (int, optional): Master port. Defaults to None.

	    Raises:
	        KeyError: If a required ``SLURM_*`` environment variable is not set.
	    """
	    # Local import: only this function needs it.
	    import shlex

	    proc_id = int(os.environ['SLURM_PROCID'])
	    ntasks = int(os.environ['SLURM_NTASKS'])
	    node_list = os.environ['SLURM_NODELIST']
	    num_gpus = torch.cuda.device_count()
	    torch.cuda.set_device(proc_id % num_gpus)
	    # The first hostname of the allocation becomes the master address.
	    # The node list is quoted so that bracket ranges such as
	    # ``node[01-04]`` are not subject to shell globbing, and the pipe is a
	    # real shell pipe (not an escaped literal ``|``).
	    addr = subprocess.getoutput(
	        f'scontrol show hostname {shlex.quote(node_list)} | head -n1')
	    # specify master port
	    if port is not None:
	        os.environ['MASTER_PORT'] = str(port)
	    elif 'MASTER_PORT' not in os.environ:
	        # if torch.distributed default port(29500) is available
	        # then use it, else find a free port
	        # NOTE(review): every task evaluates this branch on its own, so
	        # tasks may end up with different ports; pass ``port`` or export
	        # MASTER_PORT when 29500 might be busy.
	        if _is_free_port(29500):
	            os.environ['MASTER_PORT'] = '29500'
	        else:
	            os.environ['MASTER_PORT'] = str(_find_free_port())
	    # use MASTER_ADDR in the environment variable if it already exists
	    if 'MASTER_ADDR' not in os.environ:
	        os.environ['MASTER_ADDR'] = addr
	    os.environ['WORLD_SIZE'] = str(ntasks)
	    os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
	    os.environ['RANK'] = str(proc_id)
	    deepspeed.init_distributed(dist_backend=backend)