# Copyright 2025 Ant Group Inc.
# Copyright 2024 Wei Fu & Zhiyu Mei
# Licensed under the Apache License, Version 2.0 (the "License").

from __future__ import (
    annotations,  # python3.7+ feature to allow self-referencing type hints
)

import collections
import dataclasses
import datetime
import getpass
import json
import math
import os
import shutil
import socket
import subprocess
from typing import Callable, Dict, List, Literal, Optional, Union

import pandas as pd

import realhf.base.cluster as cluster
import realhf.base.logging as logging
import realhf.version as version
from realhf.base.constants import LOG_ROOT
from realhf.scheduler.client import JobException, JobInfo, JobState

logger = logging.getLogger("scheduler.slurm.utils")

SQUEUE_FIELDS = [
    "JobID",
    "State",
    "SubmitTime",
    "StartTime",
    "Name",
    "NodeList",
    "UserName",
    "MaxCPUs",
    "cpus-per-task",
    "NumTasks",
    "tres-alloc",
]
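
# Note: squeue states are mapped conservatively onto the scheduler's JobState
# below: COMPLETING still counts as RUNNING, OUT_OF_MEMORY counts as FAILED,
# and DEADLINE/TIMEOUT count as COMPLETED.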
STATUS_MAPPING = {
    "RUNNING": JobState.RUNNING,
    "COMPLETING": JobState.RUNNING,
    "PENDING": JobState.PENDING,
    "CANCELLED": JobState.CANCELLED,
    "FAILED": JobState.FAILED,
    "COMPLETED": JobState.COMPLETED,
    "OUT_OF_MEMORY": JobState.FAILED,
    "DEADLINE": JobState.COMPLETED,
    "TIMEOUT": JobState.COMPLETED,
}


class SlurmResourceNotEnoughException(Exception):
    pass


class InvalidGPUTypeException(Exception):
    pass


@dataclasses.dataclass
class SlurmResource:
    # a data class that represents a slurm resource quota
    mem: int = 0
    cpu: int = 0
    gpu_type: Optional[Literal["tesla", "geforce", "ppu"]] = None
    gpu: Union[float, int] = 0

    def __check_gpu_type(self, other: SlurmResource) -> str:
        self_gpu_type = None if self.gpu == 0 else self.gpu_type
        other_gpu_type = None if other.gpu == 0 else other.gpu_type
        # GPU types are compatible if they are identical or if at least one
        # side does not request any GPU.
        valid_gpu_type = (
            self_gpu_type == other_gpu_type
            or self_gpu_type is None
            or other_gpu_type is None
        )
        if not valid_gpu_type:
            raise InvalidGPUTypeException(
                f"Cannot add two different gpu types {self_gpu_type}, {other_gpu_type}."
            )
        return self_gpu_type if self_gpu_type else other_gpu_type

    def __str__(self):
        return (
            "SlurmResource: \n"
            + "mem: "
            + str(self.mem)
            + " MB \n"
            + "cpu: "
            + str(self.cpu)
            + " \n"
            + "gpu: "
            + str(self.gpu)
            + " \n"
            + "gpu_type: "
            + str(self.gpu_type)
        )

    def __mul__(self, other: int) -> SlurmResource:
        assert isinstance(
            other, int
        ), "ResourceRequirement can only be multiplied by int."
        return SlurmResource(
            mem=self.mem * other,
            cpu=self.cpu * other,
            gpu=self.gpu * other,
            gpu_type=self.gpu_type,
        )

    def __rmul__(self, other: int) -> SlurmResource:
        return self.__mul__(other)

    def __add__(self, other: SlurmResource) -> SlurmResource:
        assert isinstance(
            other, SlurmResource
        ), "SlurmResource can only add another SlurmResource instance."
        return SlurmResource(
            mem=self.mem + other.mem,
            cpu=self.cpu + other.cpu,
            gpu=self.gpu + other.gpu,
            gpu_type=self.__check_gpu_type(other),
        )

    def __sub__(self, other: SlurmResource) -> SlurmResource:
        assert isinstance(
            other, SlurmResource
        ), "SlurmResource can only subtract another SlurmResource instance."
        return SlurmResource(
            mem=self.mem - other.mem,
            cpu=self.cpu - other.cpu,
            gpu=self.gpu - other.gpu,
            gpu_type=self.__check_gpu_type(other),
        )

    def __neg__(self) -> SlurmResource:
        return SlurmResource(
            mem=-self.mem, cpu=-self.cpu, gpu=-self.gpu, gpu_type=self.gpu_type
        )

    def __eq__(self, other: SlurmResource) -> bool:
        return (
            self.mem == other.mem
            and self.cpu == other.cpu
            and self.gpu == other.gpu
            and self.gpu_type == other.gpu_type
        )

    def __lt__(self, other: SlurmResource) -> bool:
        # Order resources by GPU type first (no GPU < geforce < tesla/ppu),
        # then by GPU count, CPU count, and memory.
        if self.gpu_type != other.gpu_type:
            if self.gpu_type is None:
                return True
            if other.gpu_type is None:
                return False
            if self.gpu_type == "geforce":
                return True
            return False
        if self.gpu != other.gpu:
            return self.gpu < other.gpu
        if self.cpu != other.cpu:
            return self.cpu < other.cpu
        if self.mem != other.mem:
            return self.mem < other.mem
        return False

    def valid(self) -> bool:
        # check if it is a valid resource requirement
        if self.gpu_type not in ["geforce", "tesla", "ppu", None]:
            return False
        if self.mem < 0 or self.cpu < 0 or self.gpu < 0:
            return False
        return True
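
    # Illustrative usage (not executed): resource quotas support arithmetic so
    # that the scheduler can add up requirements and subtract allocations, e.g.
    #   r = SlurmResource(mem=10240, cpu=8, gpu=1, gpu_type="tesla")
    #   2 * r                      # -> mem=20480, cpu=16, gpu=2, gpu_type="tesla"
    #   r - SlurmResource(cpu=16)  # -> negative CPU, so .valid() is False
    # Adding two resources that request GPUs of different gpu_type raises
    # InvalidGPUTypeException.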


@dataclasses.dataclass
class SlurmLaunchInfo:
    """A SlurmLaunchInfo contains all information required to **launch** a
    slurm job.

    It matches one `TasksGroup` in `SchedulingConfig` and one slurm job.

    The naming conventions:
    - `job`: Literally a slurm job with a (maybe non-unique) job name and a unique job ID,
      which may contain multiple job steps and processes. It corresponds to an `sbatch` or `srun` call.
      Job names are guaranteed to be unique by the scheduler within this repo.
    - `jobstep`: Literally a slurm job step with a unique job step ID, i.e., ${jobID}.${stepID},
      which corresponds to a running instance of the `apps.remote` script, but may still contain multiple processes.
      A job step occupies at most one GPU. Processes in the same job step must share the same GPU.
    - `wproc`: A single worker process launched by the `apps.remote` script, which may occupy less than 1 GPU.
      A worker just corresponds to a process.
    - `task`: The alias of `jobstep`. It is easier to understand this concept in the context of the `srun` command:
      `--ntasks` is just the number of jobsteps. We use the alternative term `jobstep` to avoid confusion.

    Attributes:
        run_name (str): Identifier of this run, typically ${exp_name}_${trial_name}.
        worker_type (str): Type of workers to be launched, e.g. model_worker, data_worker, etc.
        worker_submission_idx (int): For heterogeneous scheduling, we submit jobs of the same worker_type to slurm
            multiple times. `worker_submission_idx` is used to distinguish them, so the (global) slurm job name will
            be ${run_name}:${worker_type}:${worker_submission_idx}.
        wprocs_in_job (int): The number of worker processes in this slurm job (across all job steps).
        n_jobsteps (int): The number of job steps of this slurm job. This is also the group size of the multiprog file.
            Will be resolved automatically according to the GPU requirement.
        wprocs_per_jobstep (int): The number of worker processes in each job step, as well as the number of sub-processes
            spawned by `apps.remote`. Will be resolved automatically according to the GPU requirement.

        resource_requirement (SlurmResource): The resource requirement of this job, including all job steps.
        cmd (str): The command to be executed.
        container_image (str): In the current PPU setup, container_image should match the format provided by singularity.
            If the image is a file, this string should be the path. If the image is a remote docker image,
            this string should be of format 'docker://<image>'.
        container_mounts (str): Mount specification passed to the container runtime.
        env_vars (dict): Environment variables to be set for the job.
        node_type (str): Required node type of this job.
        nodelist (str): Slurm nodelist constraint for this job.
        exclude (str): Nodes to exclude from scheduling.
        partition (str, optional): Slurm partition. Defaults to "all".
        time_limit (str, optional): Slurm job time limit.
        begin (str, optional): Scheduled worker start time.
        deadline (str, optional): Scheduled worker end time.
        hostfile (bool): Whether to use a hostfile for `--distribution=arbitrary` scheduling.
        hostfile_content (str, optional): The content of the hostfile.
        multiprog (bool): Whether to use a multiprog file for `--multi-prog` job submission.
        multiprog_content (str, optional): The content of the multiprog file.
    """

    run_name: str
    exper_name: str
    trial_name: str
    worker_type: str
    worker_submission_idx: int
    wprocs_in_job: int
    job_group_id: str
    job_group_index: str

    resource_requirement: SlurmResource
    cmd: str
    container_image: str
    container_mounts: str
    env_vars: dict
    node_type: str
    nodelist: str
    exclude: str
    partition: Optional[str] = "all"
    time_limit: Optional[str] = None
    begin: Optional[str] = None
    deadline: Optional[str] = None
    # hostfile
    hostfile: bool = True
    hostfile_content: Optional[str] = None
    # multiprog options, override cmd
    multiprog: bool = True
    multiprog_content: Optional[str] = None

    n_jobsteps: int = None
    wprocs_per_jobstep: int = None

    job_info: Optional[JobInfo] = None

    def __post_init__(self):
        """Resolve fractional GPU resource requirement."""
        gpu_per_worker = self.resource_requirement.gpu
        # assert gpu_per_worker <= 1 and gpu_per_worker >= 0
        if gpu_per_worker < 1 and gpu_per_worker > 0:
            self.resource_requirement.gpu = 1
            self.wprocs_per_jobstep = math.floor(1 / gpu_per_worker)
            self.resource_requirement.cpu *= self.wprocs_per_jobstep
            self.resource_requirement.mem *= self.wprocs_per_jobstep
            self.n_jobsteps = math.ceil(self.wprocs_in_job / self.wprocs_per_jobstep)
            logger.info(f"Resolved fractional GPU requirement for {self.slurm_name}")
            logger.info(
                f"GPU per worker {gpu_per_worker}, workers per jobstep (process size in `apps.remote`) {self.wprocs_per_jobstep}, "
                f"number of jobsteps (instance of running `apps.remote`) {self.n_jobsteps}"
            )
        else:
            self.n_jobsteps = self.wprocs_in_job
            self.wprocs_per_jobstep = 1
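
    # Illustrative example (assumed numbers): with wprocs_in_job=6 and a
    # per-worker requirement of gpu=0.25, cpu=2, mem=10240, __post_init__
    # rewrites the per-jobstep requirement to gpu=1, cpu=8, mem=40960, with
    # wprocs_per_jobstep=4 and n_jobsteps=ceil(6 / 4)=2, i.e. two job steps
    # that each pack up to four worker processes onto one GPU.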

    @property
    def slurm_name(self) -> str:
        return f"{self.run_name}:{self.worker_type}:{self.worker_submission_idx}"

    @property
    def slurm_id(self) -> Optional[str]:
        if self.job_info:
            return self.job_info.slurm_id
        else:
            return None

    @property
    def log_path(self) -> str:
        return os.path.join(
            LOG_ROOT,
            self.exper_name,
            self.trial_name,
            f"{self.worker_type}-{self.worker_submission_idx}.log",
        )

    @property
    def multiprog_path(self) -> str:
        path = os.path.join(
            LOG_ROOT,
            self.exper_name,
            self.trial_name,
            "slurm",
            "multiprog",
            f"{self.worker_type}-{self.worker_submission_idx}.multiprog",
        )
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    @property
    def hostfile_path(self) -> str:
        path = os.path.join(
            LOG_ROOT,
            self.exper_name,
            self.trial_name,
            "slurm",
            "hostfile",
            f"{self.worker_type}-{self.worker_submission_idx}.hostfile",
        )
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    def show_log(self):
        try:
            terminal_columns = os.get_terminal_size().columns
        except OSError:
            terminal_columns = shutil.get_terminal_size().columns
        logger.info(
            f"Showing log of slurm job: {self.worker_type}-{self.worker_submission_idx}\n\n{'-'*terminal_columns}"
        )
        subprocess.Popen(["tail", "-n50", self.log_path]).wait(timeout=3)
        logger.info(
            f"End of log: {self.worker_type}-{self.worker_submission_idx}\n\n{'-'*terminal_columns}"
        )

    def update(self):
        job_infos = query_jobs(slurm_names=[self.slurm_name])
        job_infos = sorted(
            job_infos,
            key=lambda x: parse_formatted_time(x.submit_time),
            reverse=True,
        )
        self.job_info = job_infos[0] if len(job_infos) > 0 else None
        if self.job_info:
            return self.job_info.state
        else:
            return None

    def cancel(self, signal: Literal["SIGINT", "SIGKILL"] = "SIGKILL"):
        cancel_jobs(slurm_names=[self.slurm_name], signal=signal)
        self.job_info = JobInfo(name=self.slurm_name, state=JobState.CANCELLED)

    def __str__(self):
        s = f"SlurmLaunchInfo [{self.slurm_name}] \n"
        s += f"Resources: [\n{self.resource_requirement}\n]\n"
        s += f"Multiprog Filepath: [{self.multiprog_path}]\n"
        s += f"Multiprog Content: [\n{self.multiprog_content}\n]\n"
        s += f"Hostfile Filepath: [{self.hostfile_path}]\n"
        s += f"Hostfile Content: [\n{self.hostfile_content}\n]\n"
        if self.job_info is None:
            job_info_str = "None"
        else:
            job_info_str = "\n".join(
                [f"{k}: {v}" for k, v in self.job_info.__dict__.items()]
            )
        s += f"Runtime JobInfo: [\n{job_info_str}\n]\n"
        env_var_str = "\n".join([f"{k}: {v}" for k, v in self.env_vars.items()])
        s += f"Env vars: [\n{env_var_str}\n]\n"
        return s

    def commit(self):
        os.makedirs(os.path.dirname(self.log_path), exist_ok=True, mode=0o775)

        ntasks = self.n_jobsteps
        mem = self.resource_requirement.mem
        cpu = self.resource_requirement.cpu
        gpu = self.resource_requirement.gpu

        cmd = self.cmd

        # assert gpu == 1 or gpu == 0, "Slurm job GPU requirement should be resolved to an integer."
        gpu_type = self.resource_requirement.gpu_type

        if self.multiprog:
            with open(self.multiprog_path, "w") as f:
                f.write(self.multiprog_content)
        if self.hostfile:
            with open(self.hostfile_path, "w") as f:
                f.write(self.hostfile_content)

        logger.info(
            f'Allocating {ntasks} jobstep(s) "{self.worker_type}" submission index {self.worker_submission_idx}'
            f" with {cpu} cpu, {gpu} gpu and {mem} MB memory."
        )
        logger.info(f"To check the output, run \n\t`tail -f {self.log_path}`.")

        # Setup sbatch
        # head
        gres_line = ""
        if gpu >= 1:
            assert (gpu * ntasks) % cluster.spec.n_gpus_per_node == 0
            # In the current slurm cluster setup, we can only use "--gres" to
            # allocate PPUs per node. There are no options to allocate
            # customized gres per task.
            if gpu_type == "ppu":
                gres_line = f"--gres={gpu_type}:{cluster.spec.n_gpus_per_node}"
            else:
                gres_line = f"--gres=gpu:{cluster.spec.n_gpus_per_node}"

        srun_env = os.environ.copy()
        job_metadata = {
            "user": srun_env.get("EMAILPREFIX", ""),
            "version": version.__version__,
            "branch": version.__branch__,
            "commit": version.__commit__,
            "dirty": version.__is_dirty__,
            "job_group_id": self.job_group_id,
            "job_group_index": self.job_group_index,
        }
        job_metadata_json = json.dumps(job_metadata)

        lines = [
            "#!/bin/bash",
            f"#SBATCH --job-name={self.slurm_name}",
            f"#SBATCH --output={self.log_path}",
            "#SBATCH --open-mode=append",
            f"#SBATCH --ntasks={ntasks}",
            f"#SBATCH {gres_line}",
            f"#SBATCH --cpus-per-task={cpu}",
            f"#SBATCH --mem-per-cpu={mem // max(1, cpu)}M",
            "#SBATCH --distribution=arbitrary" if self.hostfile else "",
            # f'#SBATCH --nodelist={spec.nodelist}' if spec.nodelist is not None else "",
            # f'#SBATCH --exclude={spec.exclude}' if spec.exclude is not None else "",
            f"#SBATCH --time={self.time_limit}" if self.time_limit else "",
            f"#SBATCH --begin={self.begin}" if self.begin else "",
            f"#SBATCH --deadline={self.deadline}" if self.deadline else "",
            f"#SBATCH --comment='{job_metadata_json}'",
        ]
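
        # Illustrative sketch (assumed numbers): for ntasks=8, cpu=16,
        # mem=102400 MB and gpu=1 per jobstep on a cluster with 8 GPUs per
        # node, the header above expands to roughly:
        #   #SBATCH --ntasks=8
        #   #SBATCH --gres=gpu:8
        #   #SBATCH --cpus-per-task=16
        #   #SBATCH --mem-per-cpu=6400M
        #   #SBATCH --distribution=arbitrary   (only when a hostfile is used)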

        if self.hostfile:
            srun_env["SLURM_HOSTFILE"] = self.hostfile_path
        # Setup step command.
        # add current directory into container mounts to ensure editable mode for realhf package
        srun_flags = [
            f"--ntasks={ntasks}",
            f"--cpus-per-task={cpu}",
            gres_line,
            f"--mem-per-cpu={mem // max(1, cpu)}",
            f"--multi-prog {self.multiprog_path}" if self.multiprog else "",
        ]

        # The `-K` option ensures that all job steps within the same job ID are
        # killed if one of them exits with an error. This is necessary for recovery.
        if self.multiprog:
            srun_cmd = (
                f'srun --mpi=pmi2 -K -l {" ".join(srun_flags)} {self.multiprog_path}'
            )
        else:
            srun_cmd = f'srun --mpi=pmi2 -K -l {" ".join(srun_flags)} {cmd}'

        lines += [
            'echo "[Runner] StartTime: $(date -u)"',
            'echo "[Runner] Host: $(hostname)"',
            "echo '[Runner] Command: {}'".format(srun_cmd),
            "echo '[Runner] Log: {}'".format(self.log_path),
            'echo "[Runner] CudaVisible: $CUDA_VISIBLE_DEVICES"',
            'echo "[Runner] CudaMpsPerc: $CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"',
            srun_cmd,
            "RETCODE=$?",
            'echo "[Runner] FinishTime: $(date -u)"',
            'echo "[Runner] RetCode: $RETCODE"',
            'echo "[Runner] ------------"',
            "exit $RETCODE",
        ]

        script_strs = "\n".join(list(filter(lambda x: x, lines))) + "\n"
        script = script_strs.encode("ascii")

        def pad_output_str_to_length(s: str, pad_s: str, length: int):
            assert len(pad_s) == 1
            assert len(s) + 2 <= length
            n_pads = (length - len(s) - 2) // 2
            return pad_s * n_pads + " " + s + " " + pad_s * n_pads

        with open(self.log_path, "a") as f:
            f.write(pad_output_str_to_length("SBATCH SCRIPT BEGIN", "=", 80) + "\n")
            f.write(script_strs)
            f.write(pad_output_str_to_length("SBATCH SCRIPT END", "=", 80) + "\n")
            f.write(pad_output_str_to_length("SBATCH JOB INFO BEGIN", "=", 80) + "\n")
            f.write(str(self))
            f.write(pad_output_str_to_length("SBATCH JOB INFO END", "=", 80) + "\n")
            f.write(pad_output_str_to_length("JOB OUTPUT BEGIN", "=", 80) + "\n")
        r = (
            subprocess.check_output(
                ["sbatch", "--parsable"], input=script, env=srun_env
            )
            .decode("ascii")
            .strip()
        )
        self.job_info = JobInfo(name=self.slurm_name, state=JobState.PENDING)


def parse_formatted_time(time_string: str) -> int:
    if time_string == "N/A":
        return -1
    d = datetime.datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S")
    return int(datetime.datetime.timestamp(d))


def unparse_formatted_time(timestamp: int) -> str:
    if timestamp == -1:
        return "N/A"
    d = datetime.datetime.fromtimestamp(timestamp)
    return d.strftime("%Y-%m-%dT%H:%M:%S")
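

# Note: parse_formatted_time/unparse_formatted_time interpret timestamps in the
# machine's local timezone, so for any valid squeue time string ts,
#   unparse_formatted_time(parse_formatted_time(ts)) == ts
# (up to daylight-saving edge cases), and the sentinel "N/A" round-trips to -1
# and back.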


# Slurm command execution and output parsing.
def query_jobs(
    slurm_names: Optional[List[str]] = None,
    slurm_ids: Optional[List[str]] = None,
    status: str = "all",
    delimiter: str = "__PSI__",
) -> List[JobInfo]:
    squeue_format = f":.{delimiter},".join(SQUEUE_FIELDS)
    cmd = ["squeue", "-O", squeue_format, f"-t{status}"]
    if slurm_names is not None:
        cmd += ["-n", ",".join(slurm_names)]
    if slurm_ids is not None:
        cmd += ["-j", ",".join([str(s) for s in slurm_ids])]
    output = (
        subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode("ascii").strip()
    )
    rs = []
    for line in output.split("\n")[1:]:
        job_id, state, submit_time, start_time, slurm_name, nodelist, *_ = line.split(
            delimiter
        )
        rs.append(
            JobInfo(
                name=slurm_name,
                state=STATUS_MAPPING[state],
                host=nodelist,
                submit_time=submit_time,
                start_time=start_time,
                slurm_id=job_id.strip(),
            )
        )
    return rs
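

# For example (illustrative), query_jobs(slurm_names=["run:model_worker:0"])
# runs roughly:
#   squeue -O JobID:.__PSI__,State:.__PSI__,SubmitTime:.__PSI__,... -tall -n run:model_worker:0
# and splits each non-header output line on the "__PSI__" delimiter.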


def cancel_jobs(
    slurm_names: Optional[List[str]] = None,
    slurm_ids: Optional[List[str]] = None,
    signal: Literal["SIGINT", "SIGKILL"] = "SIGKILL",
):
    assert (
        slurm_names is not None or slurm_ids is not None
    ), "Must specify slurm_names or slurm_ids."
    assert not (
        slurm_names and slurm_ids
    ), "Cannot specify both slurm_names and slurm_ids."
    cmd = ["scancel", "-s", signal]
    if slurm_names is not None:
        cmd += ["-n", ",".join(slurm_names)]
    elif slurm_ids is not None:
        cmd += ["-j", ",".join([str(s) for s in slurm_ids])]
    subprocess.check_call(cmd)
    logger.info(
        f"Cancelled Slurm job with signal {signal}: "
        f"slurm identifiers {slurm_names if slurm_ids is None else slurm_ids}"
    )


def _parse_output_status_line(status):
    assert status.startswith("State=")
    status = status.split(" ")[0]
    status = status.split("=")[1]
    return status.split("+")
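

# For example (illustrative), a `scontrol show node` line such as
# "State=IDLE+DRAIN ..." is parsed into ["IDLE", "DRAIN"].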


def _parse_output_tres_line(tres):
    tres = tres.split("=", maxsplit=1)[1]
    tres = tres.split(",")
    res = SlurmResource()
    if len(tres) == 0 or (len(tres) == 1 and tres[0] == ""):
        return SlurmResource()
    for t in tres:
        if t.startswith("mem"):
            if t.endswith("M"):
                res.mem = int(t.split("=")[1].strip("M"))
            elif t.endswith("G"):
                res.mem = int(float(t.split("=")[1].strip("G")) * 1024)
            elif t.endswith("T"):
                res.mem = int(float(t.split("=")[1].strip("T")) * 1024 * 1024)
            else:
                raise ValueError("Unknown memory unit.")
        elif t.startswith("cpu"):
            res.cpu = int(t.split("=")[1])
        elif t.startswith("gres/gpu"):
            prefix, sgpu = t.split("=")
            if ":" in prefix:
                res.gpu_type = prefix.split(":")[1]
            res.gpu = int(sgpu)
        elif t.startswith("gres/ppu"):
            prefix, sgpu = t.split("=")
            res.gpu_type = "ppu"
            res.gpu = int(sgpu)
        elif t.startswith("billing"):
            # `billing` is Slurm's accounting TRES, not a physical resource,
            # so it is ignored here.
            pass
        else:
            raise NotImplementedError(f"Unknown resource type: {repr(t)}")
    return res
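

# For example (illustrative), the line
#   "CfgTRES=cpu=128,mem=2048G,billing=128,gres/gpu:tesla=8"
# is parsed into SlurmResource(mem=2097152, cpu=128, gpu_type="tesla", gpu=8).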


def available_hostnames(
    node_type: Optional[List[str]] = None,
    nodelist: Optional[str] = None,
    exclude: Optional[str] = None,
    partition: Optional[str] = None,
) -> List[str]:
    sinfo_cmd = 'sinfo -o "%N" --noheader'
    if partition:
        sinfo_cmd += f" --partition={partition}"
    all_nodelist: str = (
        subprocess.check_output(sinfo_cmd, shell=True).decode("utf-8").strip()
    )
    all_hostnames: List[str] = (
        subprocess.check_output(
            [
                "scontrol",
                "show",
                "hostnames",
                all_nodelist,
            ]
        )
        .decode("utf-8")
        .strip()
        .split("\n")
    )

    if nodelist is not None:
        valid_hostnames: List[str] = (
            subprocess.check_output(
                [
                    "scontrol",
                    "show",
                    "hostnames",
                    nodelist,
                ]
            )
            .decode("utf-8")
            .strip()
            .split("\n")
        )
    else:
        valid_hostnames = all_hostnames

    if exclude is not None:
        excluded_hostnames: List[str] = (
            subprocess.check_output(
                [
                    "scontrol",
                    "show",
                    "hostnames",
                    exclude,
                ]
            )
            .decode("utf-8")
            .strip()
            .split("\n")
        )
        for hn in excluded_hostnames:
            if hn in valid_hostnames:
                valid_hostnames.remove(hn)

    invalid_hostnames = []
    for hn in valid_hostnames:
        if hn not in all_hostnames:
            logger.warning(
                f"Invalid host name: {hn}. Maybe it is not in this partition/cluster."
            )
            invalid_hostnames.append(hn)

    for hn in invalid_hostnames:
        valid_hostnames.remove(hn)

    return list(
        filter(
            lambda x: cluster.node_name_is_node_type(x, node_type),
            valid_hostnames,
        )
    )
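

# For example (illustrative), with nodelist="node[01-03]" and exclude="node02",
# `scontrol show hostnames` expands the bracket expressions so that the
# candidate set becomes ["node01", "node03"], which is then filtered by
# node_type and by membership in the sinfo output for the chosen partition.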


def get_all_node_resources() -> Dict[str, SlurmResource]:
    """Execute `scontrol show node` to get all node resources available in the
    slurm cluster.

    Return a dict mapping node names to their currently available SlurmResource.
    """
    o = subprocess.check_output(["scontrol", "show", "node"]).decode("utf-8")
    nodes = o.split("\n\n")
    all_rres = {}
    for node in nodes:
        if len(node) <= 1:
            continue
        ls = node.split("\n")
        node_name = ls[0].split(" ")[0].split("=")[1]
        ctres = SlurmResource()
        atres = SlurmResource()
        for l in ls:
            l = l.strip("\n").strip()
            if l.startswith("State"):
                status = _parse_output_status_line(l)
                if any(
                    x in status
                    for x in ["DOWN", "DRAIN", "NOT_RESPONDING", "COMPLETING"]
                ):
                    break
            if l.startswith("CfgTRES"):
                ctres = _parse_output_tres_line(l)
            if l.startswith("AllocTRES"):
                atres = _parse_output_tres_line(l)
        if ctres.gpu_type is None:
            ctres.gpu_type = cluster.spec.gpu_type_from_node_name(node_name)
        if atres.gpu_type is None:
            atres.gpu_type = ctres.gpu_type
        rres = ctres - atres
        if rres.valid():
            all_rres[node_name] = rres
        else:
            all_rres[node_name] = SlurmResource(gpu_type=ctres.gpu_type)

    return all_rres
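

# For example (illustrative), a node reporting CfgTRES cpu=128,mem=2048G,
# gres/gpu:tesla=8 and AllocTRES cpu=32,mem=512G,gres/gpu:tesla=2 yields a
# remaining SlurmResource of cpu=96, mem=1572864 MB, gpu=6 with gpu_type
# "tesla"; nodes in DOWN/DRAIN/NOT_RESPONDING/COMPLETING states are reported
# as having no free resources.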


def resource_to_string(resources: Dict[str, SlurmResource]) -> str:
    resource_list = [
        {
            **{"NodeName": k},
            **{
                field.name: getattr(r, field.name)
                for field in r.__dataclass_fields__.values()
            },
        }
        for k, r in resources.items()
    ]
    return pd.DataFrame(resource_list).to_string(index=False)


def allocate_resources(
    infos: List[SlurmLaunchInfo],
    strategy: Literal["empty_first", "allocated_first"] = "empty_first",
) -> List[SlurmLaunchInfo]:
    """Allocate all slurm task specs, filling in the hostfile field of each spec.

    All slurm tasks are scheduled as a pack. There are two allocation
    strategies. The first is `empty_first`, which allocates tasks to nodes
    with more free resources first. The second is `allocated_first`, which
    allocates tasks to nodes with less free resources first, without
    exceeding resource capacity.
    """
    assert strategy in ["empty_first", "allocated_first"]
    all_resources = get_all_node_resources()
    # sorted by requirements in descending order
    infos = sorted(
        infos, key=lambda x: x.n_jobsteps * x.resource_requirement, reverse=True
    )
    prioritized_hosts = set()
    for info_idx, info in enumerate(infos):
        valid_hostnames = available_hostnames(
            node_type=info.node_type,
            nodelist=info.nodelist,
            exclude=info.exclude,
            partition=info.partition,
        )
        valid_hostnames = list(filter(lambda x: x in all_resources, valid_hostnames))
        prioritized_resources = {
            hn: all_resources[hn] for hn in valid_hostnames if hn in prioritized_hosts
        }
        other_resources = {
            hn: all_resources[hn]
            for hn in valid_hostnames
            if hn not in prioritized_hosts
        }
        # sorted by available resources according to the chosen strategy
        prioritized_resources = sorted(
            prioritized_resources.items(),
            key=lambda x: x[1],
            reverse=strategy != "allocated_first",
        )
        # if all of the allocated nodes cannot satisfy the requirement,
        # find a new node according to the chosen strategy
        other_resources = sorted(
            other_resources.items(),
            key=lambda x: x[1],
            reverse=strategy != "allocated_first",
        )
        valid_resources = prioritized_resources + other_resources
        task_left = info.n_jobsteps
        allocated = dict()
        for hostname, resource in valid_resources:
            tmp = task_left
            while task_left > 0:
                # In the current slurm cluster GRES setting,
                # we can only allocate tasks at the granularity of nodes
                # (16 PPUs/8 GPUs by default).
                batched_requirement = info.resource_requirement
                batched_ntasks = 1
                gpu_per_task = info.resource_requirement.gpu
                if gpu_per_task > 0:
                    assert (
                        task_left * gpu_per_task % cluster.spec.n_gpus_per_node == 0
                    ), (task_left, gpu_per_task)
                    assert (
                        cluster.spec.n_gpus_per_node % gpu_per_task == 0
                    ), gpu_per_task
                    batched_ntasks = int(cluster.spec.n_gpus_per_node // gpu_per_task)
                    batched_requirement = batched_ntasks * info.resource_requirement
                try:
                    resource = resource - batched_requirement
                except InvalidGPUTypeException:
                    # InvalidGPUTypeException will be raised when
                    # `resource` and `batched_requirement`
                    # do not have the same GPU type.
                    break
                if not resource.valid():
                    resource += batched_requirement
                    break
                task_left -= batched_ntasks
                prioritized_hosts.add(hostname)
            if tmp - task_left > 0:
                allocated[hostname] = tmp - task_left
            all_resources[hostname] = resource
        if task_left > 0:
            if (
                info.resource_requirement.gpu_type == "ppu"
                and info.resource_requirement.gpu > 0
            ):
                logger.warning(
                    "For PPU resources, we can only allocate tasks in the "
                    f"granularity of nodes ({cluster.spec.n_gpus_per_node} PPUs)"
                )
            logger.warning(
                f'Unable to allocate {info.n_jobsteps} Jobs with name "{info.slurm_name}". '
                f"Resource Requirement of this job is: {dataclasses.asdict(info.resource_requirement)}. "
                f"Valid resources for this job is "
                f"(according to NodeType={info.node_type}, NodeList={info.nodelist}, "
                f"and Exclude={info.exclude}):\n {resource_to_string({k: v for k, v in get_all_node_resources().items() if k in valid_hostnames})}"
            )
            for pinfo in infos[:info_idx]:
                if (
                    len(
                        set(pinfo.hostfile_content.split("\n")).intersection(
                            set(valid_hostnames)
                        )
                    )
                    == 0
                ):
                    continue
                palloc = collections.defaultdict(lambda: 0)
                for _n in pinfo.hostfile_content.split("\n"):
                    palloc[_n] += 1
                logger.warning(
                    f'Found previous job "{pinfo.slurm_name}" (ntasks={pinfo.n_jobsteps}) '
                    f"has been allocated to the same set of nodes. "
                    f"Resource requirement of this job is: {dataclasses.asdict(pinfo.resource_requirement)}, "
                    f"allocation of this job is {dict(palloc)}."
                )
            raise SlurmResourceNotEnoughException()
        hostlist = []
        for hostname, task_num in allocated.items():
            hostlist += [hostname] * task_num
        info.hostfile_content = "\n".join(hostlist)
    return infos
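

# Illustrative walk-through (assumed numbers): suppose two launch infos need
# 16 and 8 single-GPU jobsteps on a cluster with 8 GPUs per node. The infos
# are sorted by total requirement, so the 16-jobstep info is placed first,
# node by node in batches of 8 tasks (one full node per batch). With
# strategy="empty_first" the emptiest matching nodes are tried first; with
# strategy="allocated_first" partially used nodes are preferred, packing jobs
# more tightly. Hosts that receive any jobstep are remembered in
# `prioritized_hosts` and tried first for subsequent infos.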


def show_tesla():
    all_rres = get_all_node_resources()
    hostname = socket.gethostname()
    for k in available_hostnames(node_type=["a100"]):
        print(k, all_rres[k])


def show_all():
    all_rres = get_all_node_resources()
    for k, v in all_rres.items():
        print(k, v)


if __name__ == "__main__":
    show_all()