AReaL/realhf/base/gpu_utils.py

# Copyright 2025 Ant Group Inc.
# Copyright 2024 Wei Fu & Zhiyu Mei
# Licensed under the Apache License, Version 2.0 (the "License").

import dataclasses
import itertools
import os
import platform
import socket
import time
from collections import defaultdict
from typing import *

import realhf.base.logging as logging
import realhf.base.name_resolve as name_resolve
import realhf.base.names as names
import realhf.base.network as network

logger = logging.getLogger("System-GPU", "system")

GPU_DEVICES_ISOLATED = False
GLOBAL_PROCESS_GROUP_NAME = "master"


def gpu_count():
    """Returns the number of GPUs on a node.

    Ad-hoc to the frl cluster.
    """
    if platform.system() == "Darwin":
        return 0
    elif platform.system() == "Windows":
        try:
            import torch

            return torch.cuda.device_count()
        except ImportError:
            return 0
    else:
        dev_directories = list(os.listdir("/dev/"))
        for cnt in itertools.count():
            if "nvidia" + str(cnt) in dev_directories:
                continue
            else:
                break
        return cnt
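
# Illustrative note (not in the original file): on a node exposing
# /dev/nvidia0 .. /dev/nvidia3, the loop above stops at the first missing
# index ("nvidia4") and returns 4; non-contiguous device numbering would
# end the count early.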


def set_cuda_device(device):
    """Set the default CUDA device.

    Useful on multi-GPU nodes. Should be called in every GPU thread.
    """
    # logger.info(f"Setting device to {device}.")
    if device != "cpu":
        import torch

        torch.cuda.set_device(device)
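
# Usage note (illustrative, not from the original file): torch.cuda.set_device
# accepts an integer index, a "cuda:N" string, or a torch.device, so callers
# may pass any of these, e.g. set_cuda_device(0) or set_cuda_device("cuda:3");
# passing "cpu" makes this function a no-op.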


def reveal_pg_identity(expr_name, trial_name, worker_index):
    """Register this worker's index under the global process group name so
    that peers can discover it through name_resolve."""
    master_group_name = names.distributed_peer(
        expr_name, trial_name, GLOBAL_PROCESS_GROUP_NAME
    )
    name_resolve.add_subentry(master_group_name, str(worker_index))


def isolate_cuda_device(
    worker_type: str,
    rank: int,
    world_size: int,
    experiment_name: str,
    trial_name: str,
):
    """Isolate CUDA_VISIBLE_DEVICES for each Slurm jobstep.

    To distinguish the concepts of job/jobstep/worker/task, see scheduler/slurm/utils.py.

    A Slurm job with multiple jobsteps does not set CUDA_VISIBLE_DEVICES properly
    for each jobstep. For example, if a job has 2 jobsteps, each with 1 GPU, and is
    allocated onto GPUs 0 and 1, then CUDA_VISIBLE_DEVICES of both jobsteps will be
    "0,1" instead of "0" and "1".

    We use this function in `apps.remote` to isolate CUDA_VISIBLE_DEVICES for each
    jobstep.

    Args:
        worker_type (str): Type of this worker.
        rank (int): Rank of the **jobstep**.
        world_size (int): Number of **jobsteps**, aka SLURM_NPROCS. However, this
            function may also be called under other schedulers (e.g. the local
            scheduler), so we don't read SLURM_NPROCS directly.
        experiment_name (str): Name of the experiment.
        trial_name (str): Name of the trial.
    """
    if not os.environ.get("CUDA_VISIBLE_DEVICES"):
        return

    name_resolve_identifier = f"__type_{worker_type}"
    name_resolve.add_subentry(
        names.distributed_local_peer(
            experiment_name,
            trial_name,
            socket.gethostname(),
            name_resolve_identifier,
        ),
        rank,
    )
    name_resolve.add_subentry(
        names.distributed_peer(experiment_name, trial_name, name_resolve_identifier),
        rank,
    )
    logger.debug(
        f"Worker type {worker_type} rank {rank} waiting for peers, world size {world_size}..."
    )
    while (
        len(
            name_resolve.get_subtree(
                names.distributed_peer(
                    experiment_name, trial_name, name_resolve_identifier
                )
            )
        )
        < world_size
    ):
        time.sleep(0.1)
    # logger.info(f"Rank {rank} discovers all peers, resolving local rank...")

    local_peer_name = names.distributed_local_peer(
        experiment_name,
        trial_name,
        socket.gethostname(),
        name_resolve_identifier,
    )
    local_peers = list(
        [
            str(x)
            for x in sorted([int(x) for x in name_resolve.get_subtree(local_peer_name)])
        ]
    )
    # logger.info(f"Rank {rank} discovers local peers with global ranks {local_peers}")
    local_peer_index = local_peers.index(str(rank))
    n_local_peers = len(local_peers)

    visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
    n_visible_devices = len(visible_devices)
    if n_visible_devices == 0:
        raise RuntimeError(
            f"No visible cuda devices: {os.environ['CUDA_VISIBLE_DEVICES']}"
        )
    if n_visible_devices == n_local_peers:
        local_gpu_id = visible_devices[local_peer_index]
    elif n_visible_devices == 1:
        local_gpu_id = os.environ["CUDA_VISIBLE_DEVICES"]
    elif n_visible_devices % n_local_peers == 0:
        # A process occupies multiple GPUs, e.g., TP generation server
        factor = n_visible_devices // n_local_peers
        local_gpu_id = visible_devices[factor * local_peer_index]
    else:
        if not os.environ.get("REAL_MODE") == "LOCAL":
            raise RuntimeError(
                f"Unresolvable CUDA_VISIBLE_DEVICES {os.environ['CUDA_VISIBLE_DEVICES']} on host {network.gethostname()}, "
                f"local peers (global ranks) {local_peers}, local peer index {local_peer_index}."
            )
        # In the local mode, all processes use GPUs in a round-robin manner
        devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        local_gpu_id = int(devices[local_peer_index % len(devices)])
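    # Worked example (illustrative, not from the original file): with
    # CUDA_VISIBLE_DEVICES="0,1,2,3" and 2 local peers, the divisible branch
    # above gives factor=2, so local peer 0 keeps GPU "0" and local peer 1
    # keeps GPU "2"; with 4 local peers, each peer keeps exactly one GPU.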
    # logger.info(
    #     f"Worker type {worker_type} rank {rank} running on host {socket.gethostname()}, "
    #     f"local peer index: {local_peer_index}, local gpu id {local_gpu_id}."
    # )
    os.environ["CUDA_VISIBLE_DEVICES"] = str(local_gpu_id)
    os.environ["GPU_DEVICES_ISOLATED"] = "1"