mirror of https://github.com/inclusionAI/AReaL
PullRequest: 252 [Feature] Fix constants initialization. (#122)
Merge branch fw/gh/fix-init-constants of git@code.alipay.com:inclusionAI/AReaL.git into gh
https://code.alipay.com/inclusionAI/AReaL/pull_requests/252?tab=comment
Reviewed-by: 晓雷 <meizhiyu.mzy@antgroup.com>

* remove LOG_ROOT
* remove MODEL_SAVE_PATH
* remove PARAM_REALLOC_PATH, DATASET_CACHE
* prepare for testing
* prepare for testing
* ready for run
* local run
* tests mainly pass
* format
* amend cluster.py
* fix
This commit is contained in:
parent b63eea9d07
commit 1ec1399f19
@ -4,10 +4,10 @@ python3 training/main_sync_ppo.py \
allocation_mode=sglang.d4p1m1+d2p2m1 \
cluster.fileroot=/storage/testing/experiments \
actor.type._class=qwen3 \
actor.path=/storage/testing/models/Qwen__Qwen3-1.7B \
actor.path=Qwen/Qwen3-1.7B \
ref.type._class=qwen3 \
ref.path=/storage/testing/models/Qwen__Qwen3-1.7B \
dataset.path=/storage/testing/dataset/boba_106k_0319.jsonl \
ref.path=Qwen/Qwen3-1.7B \
dataset.path=hf-dataset://inclusionAI/AReaL-RL-Data/data/boba_106k_0319.jsonl \
dataset.train_bs_n_seqs=32 \
group_size=8 \
ppo.gen.max_new_tokens=4096 \

@ -5,7 +5,9 @@ from typing import Dict, List, Optional, Tuple, Type, Union
from omegaconf import MISSING
from realhf.base import pkg_version
from realhf.base import logging, pkg_version

logger = logging.getLogger("CLI args")

## Data and datasets. ##

@ -351,10 +353,6 @@ class SGLangConfig:
tp_size=tp_size,
# Because we have set CUDA_VISIBLE_DEVICES to a single GPU in each process
base_gpu_id=base_gpu_id,
file_storage_path=os.path.join(
constants.SGLANG_CACHE_PATH,
f"sglang_storage{server_index}",
),
# Data parallelism
dp_size=1, # TODO: check whether we require SGLang dp
load_balance_method="round_robin",

@ -870,6 +868,30 @@ def get_user_tmp():
return user_tmp

@dataclass
class NameResolveConfig:
type: str = field(
default="nfs",
metadata={
"help": "Type of the distributed KV store for name resolving.",
"choices": ["nfs", "etcd3", "ray"],
},
)
nfs_record_root: str = field(
default="/tmp/areal/name_resolve",
metadata={
"help": "Record root for NFS name resolving. Should be available in all nodes."
},
)
etcd3_addr: str = field(
default="localhost:2379", metadata={"help": "Address of the ETCD3 server."}
)
ray_actor_name: str = field(
default="ray_kv_store",
metadata={"help": "Name of the distributed Ray KV store."},
)

@dataclass
class ClusterSpecConfig:
config_path: str = field(

@ -878,6 +900,10 @@ class ClusterSpecConfig:
"help": "JSON config path. If not given, use the following CLI args."
},
)
name_resolve: NameResolveConfig = field(
default_factory=NameResolveConfig,
metadata={"help": "Name resolving configuration."},
)
cluster_name: str = field(
default="local",
metadata={"help": "Name of the cluster. Used to set specific environs."},
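Not part of the diff: a minimal sketch of how the new name-resolve settings are meant to be used, based only on the fields added above; the backend choice and address are illustrative.

```python
# Illustrative values; NameResolveConfig is the dataclass added above
# (realhf.api.cli_args), and reconfigure() is the entry point changed in the
# name_resolve hunk further down.
from realhf.api.cli_args import NameResolveConfig
from realhf.base import name_resolve

# Pick the distributed KV store used for name resolving: "nfs", "etcd3", or "ray".
nr_cfg = NameResolveConfig(type="etcd3", etcd3_addr="localhost:2379")
name_resolve.reconfigure(nr_cfg)
```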
@ -6,7 +6,6 @@ import dataclasses
|
|||
import enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import realhf.base.cluster as cluster
|
||||
import realhf.base.topology as topology
|
||||
|
||||
|
||||
|
@ -140,7 +139,7 @@ class ModelShardID:
|
|||
)
|
||||
|
||||
def __repr__(self):
|
||||
n = cluster.spec.suffix_n_digits
|
||||
n = len(str(self.topo.world_size()))
|
||||
return f"{self.model_name}@pp{self.pp_rank:0{n}d}@tp{self.tp_rank:0{n}d}@dp{self.dp_rank:0{n}d}"
|
||||
|
||||
def __hash__(self):
|
||||
|
|
|
@ -40,7 +40,6 @@ from pydantic import field_validator, model_validator
|
|||
from realhf.api.cli_args import MicroBatchSpec
|
||||
from realhf.api.core import config as config_api
|
||||
from realhf.base import constants, datapack, logging, seeding
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.utils import load_hf_or_local_file
|
||||
|
||||
logger = logging.getLogger("api.data")
|
||||
|
@ -754,11 +753,11 @@ def get_shuffle_indices(seed: int, size: int):
|
|||
|
||||
def load_shuffle_split_dataset(
|
||||
util: DatasetUtility,
|
||||
dataset_path: str,
|
||||
dataset_path: Optional[str] = None,
|
||||
dataset_builder: Optional[Callable[[], List[Dict[str, str]]]] = None,
|
||||
):
|
||||
dataset_path = load_hf_or_local_file(dataset_path)
|
||||
if dataset_path is not None:
|
||||
dataset_path = load_hf_or_local_file(dataset_path)
|
||||
if dataset_path.endswith(".jsonl"):
|
||||
with open(dataset_path, "r") as f:
|
||||
data = [json.loads(ff) for ff in f]
|
||||
|
@ -808,17 +807,12 @@ def make_dataset(
|
|||
dp_rank: int,
|
||||
world_size: int,
|
||||
tokenizer_or_tokenizer_name: Union[transformers.PreTrainedTokenizerFast, str],
|
||||
experiment_name: str,
|
||||
trial_name: str,
|
||||
cache_root: Optional[str] = None,
|
||||
) -> torch.utils.data.Dataset:
|
||||
if isinstance(cfg, str):
|
||||
cfg = config_api.DatasetAbstraction(type_=cfg)
|
||||
|
||||
if isinstance(tokenizer_or_tokenizer_name, str):
|
||||
tokenizer = load_hf_tokenizer(tokenizer_or_tokenizer_name)
|
||||
elif tokenizer_or_tokenizer_name is None:
|
||||
raise RuntimeError("tokenizer_or_tokenizer_name cannot be None.")
|
||||
else:
|
||||
tokenizer = tokenizer_or_tokenizer_name
|
||||
util = DatasetUtility(
|
||||
|
@ -827,46 +821,8 @@ def make_dataset(
|
|||
world_size,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
if cache_root is None:
|
||||
dataset_cls = ALL_DATASET_CLASSES[cfg.type_]
|
||||
return dataset_cls(util=util, **cfg.args)
|
||||
|
||||
# Create and check cache path.
|
||||
if not cache_root.startswith(cluster_spec.fileroot) and not cache_root.startswith(
|
||||
"/home"
|
||||
):
|
||||
raise ValueError(
|
||||
f"Data cache path {cache_root} should be /home or under {cluster_spec.fileroot}."
|
||||
)
|
||||
if "_" in experiment_name or "_" in trial_name:
|
||||
raise ValueError(f"Invalid experiment/trial name.")
|
||||
|
||||
output_path = os.path.join(
|
||||
cache_root,
|
||||
experiment_name,
|
||||
trial_name,
|
||||
cfg.type_,
|
||||
f"seed{seed}",
|
||||
f"world_size{world_size}",
|
||||
f"rank{dp_rank}",
|
||||
)
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
|
||||
fname = "dataset.pt"
|
||||
cache_found = os.path.isfile(os.path.join(output_path, fname))
|
||||
|
||||
tik = time.perf_counter()
|
||||
if not cache_found:
|
||||
logger.info(f"No data cache found for rank {dp_rank}. Create it from scratch.")
|
||||
dataset = ALL_DATASET_CLASSES[cfg.type_](seed, dp_rank, world_size, **cfg.args)
|
||||
torch.save(dataset, os.path.join(output_path, fname))
|
||||
else:
|
||||
logger.info(f"Rank {dp_rank} find existing data cache, load it.")
|
||||
dataset = torch.load(os.path.join(output_path, fname))
|
||||
logger.info(f"Dataset creation/loading time: {time.perf_counter() - tik:.3f}s")
|
||||
|
||||
return dataset
|
||||
dataset_cls = ALL_DATASET_CLASSES[cfg.type_]
|
||||
return dataset_cls(util=util, **cfg.args)
|
||||
|
||||
|
||||
def gather_stat(src: List[Dict]) -> Dict:
|
||||
|
|
|
@ -25,7 +25,6 @@ from realhf.api.core.config import (
|
|||
StandaloneModelShardAbstraction,
|
||||
)
|
||||
from realhf.base import constants, topology
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
|
||||
|
||||
class ExpStatus(Enum):
|
||||
|
@ -49,66 +48,6 @@ class Scheduling:
|
|||
begin: Optional[str] = None # see "--begin" option for format
|
||||
deadline: Optional[str] = None # see "--deadline" option for format
|
||||
|
||||
@staticmethod
|
||||
def master_worker_default(**kwargs):
|
||||
return Scheduling(
|
||||
**{
|
||||
"cpu": 16,
|
||||
"mem": 20 * 1024,
|
||||
"gpu": 0,
|
||||
"container_image": cluster_spec.cpu_image,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def model_worker_default(**kwargs):
|
||||
return Scheduling(
|
||||
**{
|
||||
"cpu": 2,
|
||||
"gpu": 1,
|
||||
"mem": 60 * 1024,
|
||||
"container_image": cluster_spec.gpu_image,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def generation_server_default(**kwargs):
|
||||
return Scheduling(
|
||||
**{
|
||||
"cpu": 4,
|
||||
"gpu": 1,
|
||||
"mem": 60 * 1024,
|
||||
"container_image": cluster_spec.gpu_infer_image,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def gserver_manager_default(**kwargs):
|
||||
return Scheduling(
|
||||
**{
|
||||
"cpu": 4,
|
||||
"gpu": 0,
|
||||
"mem": 10 * 1024,
|
||||
"container_image": cluster_spec.gpu_image,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def rollout_worker_default(**kwargs):
|
||||
return Scheduling(
|
||||
**{
|
||||
"cpu": 4,
|
||||
"gpu": 0,
|
||||
"mem": 20 * 1024,
|
||||
"container_image": cluster_spec.gpu_image,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class WorkerInformation:
|
||||
|
@ -159,8 +98,6 @@ class ModelWorker:
|
|||
# dataset, for source model workers
|
||||
tokenizer_name_or_path: Optional[str] = None
|
||||
datasets: Optional[List[Union[str, DatasetAbstraction]]] = None
|
||||
use_dataset_cache: bool = False
|
||||
dataset_cahce_root: str = constants.DATASET_CACHE_PATH
|
||||
shuffle_dataset: bool = True
|
||||
cuda_cache_cleanliness: bool = True
|
||||
cuda_cache_clear_freq: int = 10
|
||||
|
@ -215,8 +152,6 @@ class RolloutWorker:
|
|||
env: EnvServiceAbstraction
|
||||
agent: AgentAbstraction
|
||||
datasets: List[Union[str, DatasetAbstraction]]
|
||||
use_dataset_cache: bool = False
|
||||
dataset_cahce_root: str = constants.DATASET_CACHE_PATH
|
||||
worker_info: WorkerInformation = None
|
||||
|
||||
|
||||
|
@ -290,16 +225,9 @@ class ExperimentConfig:
|
|||
|
||||
assert constants.trial_name() is not None
|
||||
assert constants.experiment_name() is not None
|
||||
graph_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"dataflow_graph.png",
|
||||
)
|
||||
os.makedirs(os.path.dirname(graph_path), exist_ok=True)
|
||||
# If verbose set to True here, every worker will print the graph once
|
||||
# due to lazy init on workers.
|
||||
G = dfg.build_graph(self.model_rpcs, verbose=False, graph_path=graph_path)
|
||||
G = dfg.build_graph(self.model_rpcs, verbose=False)
|
||||
for rpc in self.model_rpcs:
|
||||
rpc._G = G
|
||||
|
||||
|
@ -549,4 +477,12 @@ def register_experiment(name, cls):

def make_experiment(name) -> Experiment:
cls = ALL_EXPERIMENT_CLASSES[name]
return cls()
args = cls()
if args.cluster.config_path:
from realhf.base.cluster import load_spec_from_file

load_spec_from_file(args.cluster)
from realhf.base import name_resolve

name_resolve.reconfigure(args.cluster.name_resolve)
return args
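With this change, make_experiment both loads the cluster spec (when cluster.config_path is set) and reconfigures name resolving before returning the config. A hedged usage sketch; the experiment name is hypothetical and assumed to be registered:

```python
# Hypothetical registered experiment name.
exp_cfg = make_experiment("my-ppo-exp")
# At this point the JSON cluster spec (if any) has been merged into
# exp_cfg.cluster, and name_resolve uses exp_cfg.cluster.name_resolve.
```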
@ -8,9 +8,8 @@ from typing import List, Optional, Tuple, Union
|
|||
|
||||
import numpy as np
|
||||
|
||||
from realhf.api.cli_args import ParallelismConfig
|
||||
from realhf.api.cli_args import ClusterSpecConfig, ParallelismConfig
|
||||
from realhf.api.core.dfg import MFCDef
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.base.slurm_utils import are_ones_contiguous, parse_nodelist
|
||||
|
||||
|
||||
|
@ -76,7 +75,6 @@ class DeviceMesh:
|
|||
return device_mesh
|
||||
|
||||
def __post_init__(self):
|
||||
n = cluster_spec.suffix_n_digits
|
||||
assert self._is_valid_mapping()
|
||||
|
||||
def __eq__(self, other: "DeviceMesh"):
|
||||
|
@ -179,7 +177,10 @@ class DeviceMesh:
|
|||
|
||||
|
||||
def make_device_mesh_from_name(
|
||||
global_mesh_name: str, name: str, n_gpus_per_node: int = 8
|
||||
cluster: ClusterSpecConfig,
|
||||
global_mesh_name: str,
|
||||
name: str,
|
||||
n_gpus_per_node: int = 8,
|
||||
):
|
||||
"""
|
||||
DeviceMesh name format: <prefix><node_indices>[:<gpu_ids>]
|
||||
|
@ -191,8 +192,8 @@ def make_device_mesh_from_name(
|
|||
|
||||
Note: cluster device mesh name must occupy entire nodes.
|
||||
"""
|
||||
prefix = cluster_spec.node_name_prefix
|
||||
node_list = parse_nodelist(global_mesh_name, prefix)
|
||||
prefix = cluster.node_name_prefix
|
||||
node_list = parse_nodelist(cluster, global_mesh_name, prefix)
|
||||
n_nodes = len(node_list)
|
||||
|
||||
gpu_ids = None
|
||||
|
@ -202,7 +203,7 @@ def make_device_mesh_from_name(
|
|||
assert all(gpu_id < n_gpus_per_node for gpu_id in gpu_ids)
|
||||
else:
|
||||
node_names = name
|
||||
node_names = parse_nodelist(node_names, prefix)
|
||||
node_names = parse_nodelist(cluster, node_names, prefix)
|
||||
mapping = np.zeros((n_nodes, n_gpus_per_node), dtype=np.int32)
|
||||
if gpu_ids is None:
|
||||
node_indices = [node_list.index(node_name) for node_name in node_names]
|
||||
|
|
|
@ -16,24 +16,22 @@ from hydra.core.config_store import ConfigStore
|
|||
from omegaconf import MISSING, OmegaConf
|
||||
|
||||
import realhf.api.core.system_api as system_api
|
||||
from realhf.base.constants import init_constants
|
||||
from realhf.base.constants import (
|
||||
QUICKSTART_EXPR_CACHE_PATH,
|
||||
get_log_path,
|
||||
get_save_path,
|
||||
)
|
||||
from realhf.base.ray_utils import check_ray_availability
|
||||
from realhf.base.slurm_utils import check_slurm_availability
|
||||
|
||||
|
||||
def kind_reminder(config_name, logger, args):
|
||||
from realhf.base.constants import LOG_ROOT, MODEL_SAVE_ROOT
|
||||
|
||||
logger.info(f"Running {config_name} experiment.")
|
||||
logger.info(f"Logs will be dumped to {get_log_path(args)}")
|
||||
logger.info(
|
||||
f"Logs will be dumped to {os.path.join(LOG_ROOT, args.experiment_name, args.trial_name)}"
|
||||
)
|
||||
logger.info(
|
||||
f"Experiment configs will be dumped to {os.path.join(LOG_ROOT, args.experiment_name, args.trial_name, 'config.yaml')}"
|
||||
)
|
||||
logger.info(
|
||||
f"Model checkpoints will be saved to {os.path.join(MODEL_SAVE_ROOT, args.experiment_name, args.trial_name)}"
|
||||
f"Experiment configs will be dumped to {os.path.join(get_log_path(args), 'config.yaml')}"
|
||||
)
|
||||
logger.info(f"Model checkpoints will be saved to {get_save_path(args)}")
|
||||
|
||||
if args.mode == "slurm":
|
||||
slurm_available = check_slurm_availability()
|
||||
|
@ -82,12 +80,7 @@ def register_quickstart_exp(config_name: str, exp_cls: Callable):
|
|||
trial_name = args.trial_name
|
||||
from realhf.apps.main import main_start, main_stop
|
||||
|
||||
init_constants(args)
|
||||
from realhf.base.constants import LOG_ROOT, QUICKSTART_EXPR_CACHE_PATH
|
||||
|
||||
config_save_path = os.path.join(
|
||||
LOG_ROOT, args.experiment_name, args.trial_name, "config.yaml"
|
||||
)
|
||||
config_save_path = os.path.join(get_log_path(args), "config.yaml")
|
||||
os.makedirs(os.path.dirname(config_save_path), exist_ok=True)
|
||||
with open(config_save_path, "w") as f:
|
||||
yaml.dump(
|
||||
|
|
|
@ -94,7 +94,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
raise RuntimeError("Experiment initial setup failed.") from e
|
||||
|
||||
evaluator = (
|
||||
AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab)
|
||||
AutomaticEvaluator(exp_cfg, exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab)
|
||||
if exp_cfg.auto_eval
|
||||
else None
|
||||
)
|
||||
|
@ -116,7 +116,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
is_recover_run = recover_count > 0
|
||||
if args.recover_mode == "auto":
|
||||
try:
|
||||
recover.discover_ckpt(args.experiment_name, args.trial_name)
|
||||
recover.discover_ckpt(experiment)
|
||||
is_recover_run = True
|
||||
except recover.InValidRecoverCkpt as e:
|
||||
logger.warning(
|
||||
|
@ -127,7 +127,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
is_recover_run = False
|
||||
if is_recover_run:
|
||||
recover_ckpt_path, model_ckpt_dirs, recover_info = recover.discover_ckpt(
|
||||
args.experiment_name, args.trial_name
|
||||
experiment
|
||||
)
|
||||
logger.info(f"Will load recover info from {recover_ckpt_path}.")
|
||||
logger.info(f"Will load model checkpoints from {model_ckpt_dirs}.")
|
||||
|
@ -138,19 +138,9 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
)
|
||||
save_recover_states = args.recover_mode != "disabled"
|
||||
|
||||
cluster_spec_path = os.environ.get("CLUSTER_SPEC_PATH", "")
|
||||
if not cluster_spec_path:
|
||||
logger.info(
|
||||
"Environment variable CLUSTER_SPEC_PATH is not set. "
|
||||
"Will use the fileroot specified in CLI args. "
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
"Environment variable CLUSTER_SPEC_PATH is set. "
|
||||
"Will overwrite the cluster spec in CLI args. "
|
||||
)
|
||||
# set env vars
|
||||
BASE_ENVIRONS = constants.get_env_vars(
|
||||
experiment,
|
||||
REAL_MODE=args.mode.upper(),
|
||||
REAL_RECOVER_RUN="1" if is_recover_run else "0",
|
||||
REAL_SAVE_RECOVER_STATES="1" if save_recover_states else "0",
|
||||
|
@ -160,10 +150,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
|
||||
# setup experiments
|
||||
sched = sched_client.make(
|
||||
mode=scheduler_mode(args.mode),
|
||||
expr_name=expr_name,
|
||||
trial_name=trial_name,
|
||||
schedule_strategy=args.schedule_strategy,
|
||||
experiment,
|
||||
evaluator=evaluator,
|
||||
job_group_id=job_group_id,
|
||||
job_group_index=recover_count,
|
||||
|
@ -302,11 +289,8 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
|
||||
|
||||
def main_stop(args):
|
||||
sched = sched_client.make(
|
||||
mode=scheduler_mode(args.mode),
|
||||
expr_name=args.experiment_name,
|
||||
trial_name=args.trial_name,
|
||||
)
|
||||
experiment = config_package.make_experiment(args.experiment_name)
|
||||
sched = sched_client.make(experiment)
|
||||
sched.find_all()
|
||||
sched.stop_all()
|
||||
|
||||
|
|
|
@ -16,7 +16,6 @@ from rich.panel import Panel
|
|||
|
||||
from realhf.api.cli_args import console, highlighter, print_config_help
|
||||
from realhf.api.quickstart.entrypoint import QUICKSTART_CONFIG_CLASSES, QUICKSTART_FN
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.base.importing import import_module
|
||||
from realhf.base.prologue import (
|
||||
PROLOGUE_EXTERNAL_CONFIG_NAME,
|
||||
|
@ -36,7 +35,6 @@ import_module(
|
|||
str(pathlib.Path(__file__).resolve().parent.parent / "experiments" / "async_exp"),
|
||||
re.compile(r".*_exp\.py$"),
|
||||
)
|
||||
import realhf.experiments.benchmark.profile_exp
|
||||
|
||||
|
||||
def print_help(exp_type):
|
||||
|
@ -144,7 +142,7 @@ def prepare_hydra_config(name: str, prologue_path: str):
|
|||
config = OmegaConf.load(prologue_path)
|
||||
experiment_name = get_experiment_name(config.get("experiment_name"))
|
||||
trial_name = get_trial_name(config.get("trial_name"))
|
||||
config_dir = f"{cluster_spec.fileroot}/configs/{getpass.getuser()}/{experiment_name}/{trial_name}"
|
||||
config_dir = f"{config.cluster.fileroot}/configs/{getpass.getuser()}/{experiment_name}/{trial_name}"
|
||||
os.makedirs(config_dir, exist_ok=True)
|
||||
|
||||
config.pop(PROLOGUE_EXTERNAL_CONFIG_NAME, {})
|
||||
|
|
|
@ -21,7 +21,7 @@ from omegaconf import OmegaConf
|
|||
multiprocessing.set_start_method("spawn", force=True)
|
||||
|
||||
from realhf.api.quickstart.entrypoint import QUICKSTART_CONFIG_CLASSES
|
||||
from realhf.base import gpu_utils, importing, logging
|
||||
from realhf.base import gpu_utils, importing, logging, name_resolve
|
||||
from realhf.version import get_full_version_with_dirty_description
|
||||
|
||||
logger = logging.getLogger("Main-Workers")
|
||||
|
@ -61,7 +61,6 @@ def main_worker(args):
|
|||
import realhf.api.core.system_api as system_api
|
||||
|
||||
experiment = system_api.make_experiment(name=args.experiment_name)
|
||||
constants.init_constants(experiment)
|
||||
|
||||
worker_index_start = args.jobstep_id * args.wprocs_per_jobstep + args.wproc_offset
|
||||
worker_index_end = min(
|
||||
|
@ -166,6 +165,8 @@ def main_controller(args):
|
|||
constants.set_experiment_trial_names(args.experiment_name, args.trial_name)
|
||||
_patch_external_impl(args.experiment_name, args.trial_name)
|
||||
|
||||
experiment = system_api.make_experiment(args.experiment_name)
|
||||
|
||||
logger.debug("Running controller with args: %s", args)
|
||||
assert not args.experiment_name.startswith("/"), args.experiment_name
|
||||
try:
|
||||
|
@ -177,10 +178,6 @@ def main_controller(args):
|
|||
experiment_name=args.experiment_name,
|
||||
trial_name=args.trial_name,
|
||||
)
|
||||
experiment = system_api.make_experiment(args.experiment_name)
|
||||
|
||||
# Initialize cluster infor from ENV or CLI args.
|
||||
constants.init_constants(experiment)
|
||||
|
||||
controller.start(
|
||||
experiment=experiment,
|
||||
|
|
|
@ -3,143 +3,23 @@
|
|||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import TYPE_CHECKING, Dict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from realhf.api.cli_args import BaseExperimentConfig
|
||||
from realhf.api.cli_args import ClusterSpecConfig
|
||||
|
||||
|
||||
class ClusterSpec:
|
||||
def __init__(self):
|
||||
# Set default values to comfort ray
|
||||
from realhf.api.cli_args import BaseExperimentConfig
|
||||
def load_spec_from_file(config: "ClusterSpecConfig"):
|
||||
with open(config.config_path, "r") as f:
|
||||
spec: Dict = json.load(f)
|
||||
|
||||
self.load_spec_from_args(BaseExperimentConfig())
|
||||
|
||||
self.__loaded = False
|
||||
|
||||
def load_spec_from_file(self, file_path: str):
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"Cluster spec file not found: {file_path}")
|
||||
|
||||
with open(file_path, "r") as f:
|
||||
spec: Dict = json.load(f)
|
||||
|
||||
self.__cluster_type = spec["cluster_type"]
|
||||
self.__cluster_name = spec["cluster_name"]
|
||||
self.__fileroot = spec["fileroot"]
|
||||
self.__gpu_type = spec.get("gpu_type", None)
|
||||
self.__mount = spec.get("default_mount", None)
|
||||
self.__gpu_image = spec.get("gpu_image", None)
|
||||
self.__gpu_infer_image = spec.get("gpu_infer_image", self.__gpu_image)
|
||||
self.__cpu_image = spec.get("cpu_image", None)
|
||||
self.__node_name_prefix = spec.get("node_name_prefix", "slurmd-")
|
||||
# self.__n_nodes decides number of digits in slurm hostnames
|
||||
# e.g. if __n_nodes = 32, then the hostnames will be slurmd-{:02d}
|
||||
# if __n_nodes = 128, then the hostnames will be slurmd-{:03d}
|
||||
self.__n_nodes = int(spec.get("n_nodes", 32))
|
||||
self.__n_gpus_per_node = int(spec.get("n_gpus_per_node", 8))
|
||||
assert isinstance(self.__n_nodes, int)
|
||||
|
||||
self.__loaded = True
|
||||
|
||||
def load_spec_from_args(self, args: "BaseExperimentConfig"):
|
||||
self.__cluster_type = args.mode
|
||||
self.__cluster_name = args.cluster.cluster_name
|
||||
self.__fileroot = args.cluster.fileroot
|
||||
self.__gpu_type = args.cluster.gpu_type
|
||||
self.__mount = args.cluster.mount
|
||||
self.__gpu_image = args.cluster.gpu_image
|
||||
self.__gpu_infer_image = args.cluster.gpu_infer_image
|
||||
self.__cpu_image = args.cluster.cpu_image
|
||||
self.__node_name_prefix = args.cluster.node_name_prefix
|
||||
self.__n_nodes = args.cluster.n_nodes
|
||||
self.__n_gpus_per_node = args.cluster.n_gpus_per_node
|
||||
self.__loaded = True
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
assert self.__loaded
|
||||
return self.__cluster_name
|
||||
|
||||
@property
|
||||
def gpu_type(self):
|
||||
assert self.__loaded
|
||||
return self.__gpu_type
|
||||
|
||||
@property
|
||||
def fileroot(self) -> str:
|
||||
"""Return the root directory of the file system in the cluster.
|
||||
|
||||
When running experiments, files such as logs, checkpoints,
|
||||
caches will be saved under this directory.
|
||||
"""
|
||||
assert self.__loaded
|
||||
return self.__fileroot
|
||||
|
||||
@fileroot.setter
|
||||
def fileroot(self, root: str):
|
||||
# Used for testing
|
||||
self.__fileroot = root
|
||||
|
||||
@property
|
||||
def mount(self) -> str:
|
||||
"""Directories that should be mounted to container that runs
|
||||
workers."""
|
||||
assert self.__loaded
|
||||
return self.__mount
|
||||
|
||||
@property
|
||||
def gpu_image(self) -> str:
|
||||
"""Return the default image for containers of GPU trainer workers."""
|
||||
assert self.__loaded
|
||||
return self.__gpu_image
|
||||
|
||||
@property
|
||||
def gpu_infer_image(self) -> str:
|
||||
"""Return the default image for containers of GPU inference workers."""
|
||||
assert self.__loaded
|
||||
return self.__gpu_infer_image
|
||||
|
||||
@property
|
||||
def cpu_image(self) -> str:
|
||||
"""Return the default image for containers of CPU workers."""
|
||||
assert self.__loaded
|
||||
return self.__cpu_image
|
||||
|
||||
@property
|
||||
def node_name_prefix(self) -> str:
|
||||
"""Return the prefix of node names in slurm format."""
|
||||
assert self.__loaded
|
||||
return self.__node_name_prefix
|
||||
|
||||
@property
|
||||
def n_nodes(self) -> int:
|
||||
return self.__n_nodes
|
||||
|
||||
@property
|
||||
def suffix_n_digits(self) -> int:
|
||||
return len(str(self.__n_nodes))
|
||||
|
||||
@property
|
||||
def n_gpus_per_node(self) -> int:
|
||||
return self.__n_gpus_per_node
|
||||
|
||||
@property
|
||||
def cluster_type(self) -> str:
|
||||
return self.__cluster_type
|
||||
|
||||
|
||||
spec = ClusterSpec()
|
||||
|
||||
|
||||
def init_cluster_spec(args: "BaseExperimentConfig"):
|
||||
global spec
|
||||
CLUSTER_SPEC_PATH = os.environ.get("CLUSTER_SPEC_PATH", "")
|
||||
if args.cluster.config_path:
|
||||
spec.load_spec_from_file(args.cluster.config_path)
|
||||
elif CLUSTER_SPEC_PATH:
|
||||
spec.load_spec_from_file(CLUSTER_SPEC_PATH)
|
||||
else:
|
||||
spec.load_spec_from_args(args)
|
||||
config.cluster_name = spec["cluster_name"]
|
||||
config.fileroot = spec["fileroot"]
|
||||
config.gpu_type = spec.get("gpu_type", None)
|
||||
config.mount = spec.get("default_mount", None)
|
||||
config.gpu_image = spec.get("gpu_image", None)
|
||||
config.gpu_infer_image = spec.get("gpu_infer_image", config.gpu_image)
|
||||
config.cpu_image = spec.get("cpu_image", None)
|
||||
config.node_name_prefix = spec.get("node_name_prefix", "slurmd-")
|
||||
config.n_nodes = int(spec.get("n_nodes", 32))
|
||||
config.n_gpus_per_node = int(spec.get("n_gpus_per_node", 8))
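For reference, a hypothetical cluster spec that load_spec_from_file would accept; the keys mirror the spec.get(...) reads above, and all values are illustrative:

```python
# Hypothetical contents of the JSON file referenced by ClusterSpecConfig.config_path.
spec = {
    "cluster_name": "my_cluster",
    "fileroot": "/storage/experiments",
    "gpu_image": "my-registry/gpu:latest",   # illustrative image names
    "cpu_image": "my-registry/cpu:latest",
    "node_name_prefix": "slurmd-",
    "n_nodes": 32,
    "n_gpus_per_node": 8,
}
```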
|
||||
|
|
|
@ -4,13 +4,10 @@
|
|||
|
||||
# log format constants
|
||||
import contextlib
|
||||
import copy
|
||||
import datetime
|
||||
import getpass
|
||||
import os
|
||||
import pathlib
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import *
|
||||
|
||||
|
@ -69,139 +66,115 @@ TORCH_FORCE_CPU = False
|
|||
|
||||
# constants in experiment instance scope
|
||||
LOCAL_CACHE_DIR = "/tmp/realhf"
|
||||
QUICKSTART_EXPR_CACHE_PATH = str(Path(__file__).parent.parent.parent / ".cache")
|
||||
os.makedirs(QUICKSTART_EXPR_CACHE_PATH, exist_ok=True)
|
||||
PORT_LOCKFILE_ROOT = os.getenv("AREAL_PORT_LOCKFILE_ROOT", "/tmp/areal/ports/")
|
||||
os.makedirs(PORT_LOCKFILE_ROOT, exist_ok=True)
|
||||
|
||||
PYTORCH_KERNEL_CACHE_PATH = (
|
||||
f"{LOCAL_CACHE_DIR}/.cache/{getpass.getuser()}/torch/kernels"
|
||||
)
|
||||
TRITON_CACHE_PATH = f"{LOCAL_CACHE_DIR}/.cache/{getpass.getuser()}/triton"
|
||||
QUICKSTART_EXPR_CACHE_PATH = str(Path(__file__).parent.parent.parent / ".cache")
|
||||
os.makedirs(PYTORCH_KERNEL_CACHE_PATH, exist_ok=True)
|
||||
os.makedirs(TRITON_CACHE_PATH, exist_ok=True)
|
||||
os.makedirs(QUICKSTART_EXPR_CACHE_PATH, exist_ok=True)
|
||||
|
||||
# Global constants that should be initialized after cluster initialization.
|
||||
MODEL_SAVE_ROOT = None
|
||||
LOG_ROOT = None
|
||||
RECOVER_ROOT = None
|
||||
SLURM_LOCK_FILE_NAME = None
|
||||
PORT_LOCK_FILE_ROOT = None
|
||||
DATASET_CACHE_PATH = None
|
||||
PROFILER_CACHE_PATH = None
|
||||
PARAM_REALLOC_PATH = None
|
||||
SGLANG_CACHE_PATH = None
|
||||
TORCH_EXTENSIONS_DIR = None
|
||||
BASE_ENVIRONS = None
|
||||
|
||||
|
||||
def init_constants(args: "BaseExperimentConfig"):
|
||||
from realhf.base.cluster import init_cluster_spec
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
def get_cache_path(args: "BaseExperimentConfig") -> str:
|
||||
path = f"{args.cluster.fileroot}/.cache/{getpass.getuser()}/{args.experiment_name}/{args.trial_name}"
|
||||
os.makedirs(path, exist_ok=True)
|
||||
return path
|
||||
|
||||
init_cluster_spec(args)
|
||||
|
||||
globals_dict = globals() # Get module's global variables
|
||||
def get_log_root(args: "BaseExperimentConfig") -> str:
|
||||
log_root = f"{args.cluster.fileroot}/logs/{getpass.getuser()}"
|
||||
os.makedirs(log_root, exist_ok=True)
|
||||
return log_root
|
||||
|
||||
kwargs = dict(
|
||||
MODEL_SAVE_ROOT=f"{cluster_spec.fileroot}/checkpoints/{getpass.getuser()}",
|
||||
LOG_ROOT=f"{cluster_spec.fileroot}/logs/{getpass.getuser()}",
|
||||
RECOVER_ROOT=f"{cluster_spec.fileroot}/recover/{getpass.getuser()}",
|
||||
SLURM_LOCK_FILE_NAME=f"{cluster_spec.fileroot}/logs/slurm_scheduler.lock",
|
||||
PORT_LOCK_FILE_ROOT=f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/ports",
|
||||
DATASET_CACHE_PATH=f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/datasets",
|
||||
PROFILER_CACHE_PATH=f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/profiler",
|
||||
PARAM_REALLOC_PATH=f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/param_realloc",
|
||||
SGLANG_CACHE_PATH=f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/sglang",
|
||||
TORCH_EXTENSIONS_DIR=(
|
||||
f"{cluster_spec.fileroot}/.cache/{getpass.getuser()}/torch/extensions"
|
||||
),
|
||||
)
|
||||
BASE_ENVIRONS = {
|
||||
# "PYTHONPATH": "/realhf",
|
||||
"REAL_IS_REMOTE": "1",
|
||||
# "NCCL_P2P_DISABLE": "1",
|
||||
# "NCCL_IB_DISABLE": "1",
|
||||
"TRANSFORMERS_OFFLINE": "1",
|
||||
"PYTORCH_KERNEL_CACHE_PATH": PYTORCH_KERNEL_CACHE_PATH,
|
||||
"TRITON_CACHE_DIR": TRITON_CACHE_PATH,
|
||||
"TOKENIZERS_PARALLELISM": "true",
|
||||
"TORCH_EXTENSIONS_DIR": kwargs["TORCH_EXTENSIONS_DIR"],
|
||||
# "TORCH_DISTRIBUTED_DEBUG": "DETAIL",
|
||||
# "NCCL_SOCKET_IFNAME": "ibp71s0",
|
||||
# "GLOO_SOCKET_IFNAME": "ibp71s0",
|
||||
# "TORCH_USE_CUDA_DSA": "1",
|
||||
# "NCCL_IGNORE_DISABLED_P2P": "1",
|
||||
# "CUDA_LAUNCH_BLOCKING": "1", # NOTE: CUDAGraph Capturing will not work if CUDA_LAUNCH_BLOCKING is set to 1.
|
||||
# "NCCL_COMM_BLOCKING": "1", # NOTE: CUDAGraph Capturing will not work if NCCL_COMM_BLOCKING is set to 1.
|
||||
# "NCCL_BLOCKING_WAIT": "1", # NOTE: CUDAGraph Capturing will not work if NCCL_BLOCKING_WAIT is set to 1.
|
||||
# "TORCH_SHOW_CPP_STACKTRACES": "1",
|
||||
# "RAY_DEDUP_LOGS": "0", # disable ray log deduplication
|
||||
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
|
||||
"OMP_NUM_THREADS": str(min(os.cpu_count(), 32)),
|
||||
# torch.distributed.all_reduce does not free the input tensor until
|
||||
# the synchronization point. This causes the memory usage to grow
|
||||
# as the number of all_reduce calls increases. This env var disables
|
||||
# this behavior.
|
||||
# Related issue:
|
||||
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
|
||||
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
|
||||
# Whether to enable time mark to plot timelines.
|
||||
"REAL_CUDA_TMARK": os.getenv("REAL_CUDA_TMARK", "0"),
|
||||
"REAL_DUMP_TRACE": os.getenv("REAL_DUMP_TRACE", "0"),
|
||||
"REAL_DUMP_MEMORY": os.getenv("REAL_DUMP_MEMORY", "0"),
|
||||
"REAL_GPU_MEMORY_KILL_THRESHOLD": os.getenv(
|
||||
"REAL_GPU_MEMORY_KILL_THRESHOLD", "1.0"
|
||||
),
|
||||
"LC_ALL": "C",
|
||||
"LANG": "C",
|
||||
"NCCL_DEBUG": "WARN",
|
||||
}
|
||||
kwargs["BASE_ENVIRONS"] = BASE_ENVIRONS
|
||||
# Set PPU-specific environment variables for stable training.
|
||||
if cluster_spec.name == "wa180":
|
||||
logger.warning("Detected PPU. Amending PPU-related environment variables.")
|
||||
PPU_ENVIRONS = {
|
||||
"NCCL_DEBUG": "INFO",
|
||||
"NCCL_IB_DISABLE": "1",
|
||||
"NCCL_DEBUG_SUBSYS": "INIT",
|
||||
"NCCL_SET_THREAD_NAME": "1",
|
||||
"NCCL_IB_HCA": "",
|
||||
"NCCL_SOCKET_IFNAME": "bond0",
|
||||
"PCCL_STATE_MONITOR_DISABLE": "1",
|
||||
}
|
||||
kwargs["BASE_ENVIRONS"].update(PPU_ENVIRONS)
|
||||
elif cluster_spec.name == "na132":
|
||||
# Specific environment variable for h800 cluster na132
|
||||
NV_ENVIRONS = {
|
||||
"NCCL_SOCKET_IFNAME": "bond0",
|
||||
"NCCL_NET_PLUGIN": "",
|
||||
"NCCL_IB_GID_INDEX": "3",
|
||||
"NCCL_IB_TIMEOUT": "2",
|
||||
"NCCL_IB_RETRY_CNT": "7",
|
||||
"NCCL_IB_SL": "5",
|
||||
"NCCL_IB_TC": "136",
|
||||
"NCCL_IB_HCA": "mlx5_bond",
|
||||
"NCCL_IB_QPS_PER_CONNECTION": "8",
|
||||
"NCCL_SET_THREAD_NAME": "1",
|
||||
"NCCL_DEBUG_SUBSYS": "INIT,TUNING,GRAPH",
|
||||
}
|
||||
kwargs["BASE_ENVIRONS"].update(NV_ENVIRONS)
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if key not in globals_dict:
|
||||
raise ValueError(f"Invalid constant name: {key}")
|
||||
if globals_dict[key] is not None and globals_dict[key] != value:
|
||||
raise RuntimeError(f"Constant '{key}' already initialized!")
|
||||
globals_dict[key] = value
|
||||
def get_log_path(args: "BaseExperimentConfig") -> str:
|
||||
log_path = f"{args.cluster.fileroot}/logs/{getpass.getuser()}/{args.experiment_name}/{args.trial_name}"
|
||||
os.makedirs(log_path, exist_ok=True)
|
||||
return log_path
|
||||
|
||||
# make directories if does not exist
|
||||
os.makedirs(globals_dict["PARAM_REALLOC_PATH"], exist_ok=True)
|
||||
os.makedirs(globals_dict["MODEL_SAVE_ROOT"], exist_ok=True)
|
||||
os.makedirs(globals_dict["LOG_ROOT"], exist_ok=True)
|
||||
os.makedirs(globals_dict["RECOVER_ROOT"], exist_ok=True)
|
||||
os.makedirs(globals_dict["DATASET_CACHE_PATH"], exist_ok=True)
|
||||
os.makedirs(globals_dict["PROFILER_CACHE_PATH"], exist_ok=True)
|
||||
os.makedirs(globals_dict["TORCH_EXTENSIONS_DIR"], exist_ok=True)
|
||||
os.makedirs(globals_dict["PORT_LOCK_FILE_ROOT"], exist_ok=True)
|
||||
os.makedirs(globals_dict["SGLANG_CACHE_PATH"], exist_ok=True)
|
||||
|
||||
def get_save_root(args: "BaseExperimentConfig") -> str:
|
||||
path = f"{args.cluster.fileroot}/checkpoints/{getpass.getuser()}"
|
||||
os.makedirs(path, exist_ok=True)
|
||||
return path
|
||||
|
||||
|
||||
def get_save_path(args: "BaseExperimentConfig") -> str:
|
||||
path = f"{args.cluster.fileroot}/checkpoints/{getpass.getuser()}/{args.experiment_name}/{args.trial_name}"
|
||||
os.makedirs(path, exist_ok=True)
|
||||
return path
|
||||
|
||||
|
||||
def get_param_realloc_path(args: "BaseExperimentConfig"):
|
||||
path = f"{args.cluster.fileroot}/.cache/{getpass.getuser()}/param_realloc"
|
||||
os.makedirs(path, exist_ok=True)
|
||||
return path
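A hedged sketch of where the new path helpers place files, assuming a config with cluster.fileroot set to /storage/experiments, experiment_name "my-exp", and trial_name "run0" (construction of args omitted):

```python
# Resulting directories, following the f-strings above (<user> = getpass.getuser()).
get_log_path(args)            # /storage/experiments/logs/<user>/my-exp/run0
get_save_path(args)           # /storage/experiments/checkpoints/<user>/my-exp/run0
get_param_realloc_path(args)  # /storage/experiments/.cache/<user>/param_realloc
```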
|
||||
|
||||
|
||||
BASE_ENVIRONS = {
|
||||
# "PYTHONPATH": "/realhf",
|
||||
"REAL_IS_REMOTE": "1",
|
||||
# "NCCL_P2P_DISABLE": "1",
|
||||
# "NCCL_IB_DISABLE": "1",
|
||||
"TRANSFORMERS_OFFLINE": "1",
|
||||
"TOKENIZERS_PARALLELISM": "true",
|
||||
"PYTORCH_KERNEL_CACHE_PATH": PYTORCH_KERNEL_CACHE_PATH,
|
||||
"TRITON_CACHE_DIR": TRITON_CACHE_PATH,
|
||||
# "TORCH_DISTRIBUTED_DEBUG": "DETAIL",
|
||||
# "NCCL_SOCKET_IFNAME": "ibp71s0",
|
||||
# "GLOO_SOCKET_IFNAME": "ibp71s0",
|
||||
# "TORCH_USE_CUDA_DSA": "1",
|
||||
# "NCCL_IGNORE_DISABLED_P2P": "1",
|
||||
# "CUDA_LAUNCH_BLOCKING": "1", # NOTE: CUDAGraph Capturing will not work if CUDA_LAUNCH_BLOCKING is set to 1.
|
||||
# "NCCL_COMM_BLOCKING": "1", # NOTE: CUDAGraph Capturing will not work if NCCL_COMM_BLOCKING is set to 1.
|
||||
# "NCCL_BLOCKING_WAIT": "1", # NOTE: CUDAGraph Capturing will not work if NCCL_BLOCKING_WAIT is set to 1.
|
||||
# "TORCH_SHOW_CPP_STACKTRACES": "1",
|
||||
# "RAY_DEDUP_LOGS": "0", # disable ray log deduplication
|
||||
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
|
||||
"OMP_NUM_THREADS": str(min(os.cpu_count(), 32)),
|
||||
# torch.distributed.all_reduce does not free the input tensor until
|
||||
# the synchronization point. This causes the memory usage to grow
|
||||
# as the number of all_reduce calls increases. This env var disables
|
||||
# this behavior.
|
||||
# Related issue:
|
||||
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
|
||||
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
|
||||
# Whether to enable time mark to plot timelines.
|
||||
"REAL_DUMP_TRACE": os.getenv("REAL_DUMP_TRACE", "0"),
|
||||
"REAL_DUMP_MEMORY": os.getenv("REAL_DUMP_MEMORY", "0"),
|
||||
"REAL_GPU_MEMORY_KILL_THRESHOLD": os.getenv(
|
||||
"REAL_GPU_MEMORY_KILL_THRESHOLD", "1.0"
|
||||
),
|
||||
"LC_ALL": "C",
|
||||
"LANG": "C",
|
||||
"NCCL_DEBUG": "WARN",
|
||||
}
|
||||
PPU_ENVIRONS = {
|
||||
"NCCL_DEBUG": "INFO",
|
||||
"NCCL_IB_DISABLE": "1",
|
||||
"NCCL_DEBUG_SUBSYS": "INIT",
|
||||
"NCCL_SET_THREAD_NAME": "1",
|
||||
"NCCL_IB_HCA": "",
|
||||
"NCCL_SOCKET_IFNAME": "bond0",
|
||||
"PCCL_STATE_MONITOR_DISABLE": "1",
|
||||
}
|
||||
NA132_ENVIRONS = {
|
||||
"NCCL_SOCKET_IFNAME": "bond0",
|
||||
"NCCL_NET_PLUGIN": "",
|
||||
"NCCL_IB_GID_INDEX": "3",
|
||||
"NCCL_IB_TIMEOUT": "2",
|
||||
"NCCL_IB_RETRY_CNT": "7",
|
||||
"NCCL_IB_SL": "5",
|
||||
"NCCL_IB_TC": "136",
|
||||
"NCCL_IB_HCA": "mlx5_bond",
|
||||
"NCCL_IB_QPS_PER_CONNECTION": "8",
|
||||
"NCCL_SET_THREAD_NAME": "1",
|
||||
"NCCL_DEBUG_SUBSYS": "INIT,TUNING,GRAPH",
|
||||
}
|
||||
|
||||
|
||||
# _model_name will be changed in the model_scope context manager
|
||||
|
@ -570,18 +543,21 @@ def get_repo_path() -> pathlib.Path:
|
|||
return pathlib.Path(__file__).resolve().parent.parent.parent
|
||||
|
||||
|
||||
def get_env_vars(**kwargs):
|
||||
def get_env_vars(exp_cfg: "BaseExperimentConfig", **kwargs):
|
||||
kwargs.update(
|
||||
CLUSTER_SPEC_PATH=os.environ.get("CLUSTER_SPEC_PATH", ""),
|
||||
REAL_DUMP_TRACE=os.environ.get("REAL_DUMP_TRACE", "0"),
|
||||
REAL_RECORD_PERFORMANCE=os.environ.get("REAL_RECORD_PERFORMANCE", "0"),
|
||||
FUNCTIONCALL_SERVICE_DOMAIN=os.getenv("FUNCTIONCALL_SERVICE_DOMAIN", ""),
|
||||
REAL_DUMP_MEMORY=os.environ.get("REAL_DUMP_MEMORY", "0"),
|
||||
REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
|
||||
REAL_OSS_TESTCASE_PATH=os.getenv("REAL_OSS_TESTCASE_PATH", ""),
|
||||
)
|
||||
return {
|
||||
envvars = {
|
||||
**kwargs,
|
||||
"REAL_PACKAGE_PATH": str(get_repo_path()),
|
||||
**BASE_ENVIRONS,
|
||||
}
|
||||
if exp_cfg.cluster.cluster_name == "wa180":
|
||||
envvars.update(**PPU_ENVIRONS)
|
||||
if exp_cfg.cluster.cluster_name == "na132":
|
||||
envvars.update(**NA132_ENVIRONS)
|
||||
return envvars
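get_env_vars now takes the experiment config so the PPU ("wa180") and NA132 NCCL overrides can be keyed off cluster.cluster_name instead of the old global cluster spec. A hedged call sketch; the keyword values are illustrative:

```python
# Extra kwargs become environment variables layered on top of BASE_ENVIRONS.
env = get_env_vars(exp_cfg, REAL_MODE="RAY", REAL_RECOVER_RUN="0")
```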
|
||||
|
|
|
@ -358,106 +358,6 @@ def calculate_llama_gen_flops(
|
|||
return flops
|
||||
|
||||
|
||||
#################### CUDA Kernel Time Marking Start ####################
|
||||
# Used to create timeline plots.
|
||||
|
||||
|
||||
class CUDATimeMarkType(enum.Enum):
|
||||
forward = "forward"
|
||||
backward = "backward"
|
||||
optim_step = "optim_step"
|
||||
comm = "comm"
|
||||
misc = "misc"
|
||||
mem_layout = "memory_layout"
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class TimeMarkEntry:
|
||||
name: str
|
||||
model_name: "ModelName"
|
||||
type_: CUDATimeMarkType
|
||||
start_time: int
|
||||
end_time: int
|
||||
|
||||
|
||||
TIME_MARK_DB = []
|
||||
|
||||
|
||||
def cuda_tmark(name: str, type_: CUDATimeMarkType):
|
||||
if os.getenv("REAL_CUDA_TMARK", None) == "1":
|
||||
|
||||
def wrapper(f: Callable):
|
||||
|
||||
def _wrapped_f(*args, **kwargs):
|
||||
import torch
|
||||
|
||||
if constants.use_cuda():
|
||||
from realhf.base.constants import _model_name
|
||||
|
||||
torch.cuda.synchronize()
|
||||
tik = time.time_ns()
|
||||
res = f(*args, **kwargs)
|
||||
torch.cuda.synchronize()
|
||||
tok = time.time_ns()
|
||||
global TIME_MARK_DB
|
||||
TIME_MARK_DB.append(
|
||||
TimeMarkEntry(name, _model_name, type_, tik, tok)
|
||||
)
|
||||
else:
|
||||
res = f(*args, **kwargs)
|
||||
return res
|
||||
|
||||
return _wrapped_f
|
||||
|
||||
else:
|
||||
|
||||
def wrapper(f):
|
||||
return f
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def cuda_tmarked(name: str, type_: CUDATimeMarkType):
|
||||
if os.getenv("REAL_CUDA_TMARK", None) == "1":
|
||||
import torch
|
||||
|
||||
if constants.use_cuda():
|
||||
from realhf.base.constants import _model_name
|
||||
|
||||
torch.cuda.synchronize()
|
||||
tik = time.time_ns()
|
||||
yield
|
||||
if os.getenv("REAL_CUDA_TMARK", None) == "1":
|
||||
if constants.use_cuda():
|
||||
torch.cuda.synchronize()
|
||||
tok = time.time_ns()
|
||||
global TIME_MARK_DB
|
||||
TIME_MARK_DB.append(TimeMarkEntry(name, _model_name, type_, tik, tok))
|
||||
|
||||
|
||||
def fetch_latest_tmark():
|
||||
global TIME_MARK_DB
|
||||
return TIME_MARK_DB[-1]
|
||||
|
||||
|
||||
def dump_tmark_db(worker_idx):
|
||||
if os.getenv("REAL_CUDA_TMARK", None) != "1":
|
||||
return
|
||||
fn = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
f"time_marks{worker_idx}.pkl",
|
||||
)
|
||||
global TIME_MARK_DB
|
||||
with open(fn, "wb") as f:
|
||||
pickle.dump(TIME_MARK_DB, f)
|
||||
TIME_MARK_DB.clear()
|
||||
|
||||
|
||||
#################### CUDA Kernel Time Marking End ####################
|
||||
|
||||
#################### CUDA Kernel Time Statistics Start ####################
|
||||
# Categorizing CUDA kernels into computation, communication, memory IO, and MISC/IDLE,
|
||||
# used to plot the percentage of time spent on each category and show how much we can
|
||||
|
|
|
@ -11,7 +11,7 @@ import shutil
|
|||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from typing import Callable, List, Optional
|
||||
from typing import TYPE_CHECKING, Callable, List, Optional
|
||||
|
||||
import ray
|
||||
|
||||
|
@ -22,6 +22,9 @@ except Exception:
|
|||
|
||||
from realhf.base import logging, security, timeutil
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from realhf.api.cli_args import NameResolveConfig
|
||||
|
||||
logger = logging.getLogger("name-resolve")
|
||||
|
||||
|
||||
|
@ -45,12 +48,6 @@ class NameRecordRepository:
|
|||
except Exception as e:
|
||||
logger.info(f"Exception ignore when deleting NameResolveRepo {e}")
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.reset()
|
||||
|
||||
def add(
|
||||
self,
|
||||
name,
|
||||
|
@ -287,24 +284,20 @@ class MemoryNameRecordRepository(NameRecordRepository):
|
|||
|
||||
|
||||
class NfsNameRecordRepository(NameRecordRepository):
|
||||
RECORD_ROOT = ""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
def __init__(self, record_root="", **kwargs):
|
||||
self.__to_delete = set()
|
||||
self.record_root = record_root
|
||||
|
||||
@staticmethod
|
||||
def __dir_path(name):
|
||||
if not NfsNameRecordRepository.RECORD_ROOT:
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
def __dir_path(self, name):
|
||||
if not self.record_root:
|
||||
raise RuntimeError(
|
||||
f"The `record_root` of NfsNameRecordRepository is not properly reconfigured."
|
||||
)
|
||||
return os.path.join(self.record_root, name)
|
||||
|
||||
RECORD_ROOT = f"{cluster_spec.fileroot}/name_resolve/"
|
||||
os.makedirs(RECORD_ROOT, exist_ok=True)
|
||||
NfsNameRecordRepository.RECORD_ROOT = RECORD_ROOT
|
||||
return os.path.join(NfsNameRecordRepository.RECORD_ROOT, name)
|
||||
|
||||
@staticmethod
|
||||
def __file_path(name):
|
||||
return os.path.join(NfsNameRecordRepository.__dir_path(name), "ENTRY")
|
||||
def __file_path(self, name):
|
||||
return os.path.join(self.__dir_path(name), "ENTRY")
|
||||
|
||||
def add(
|
||||
self,
|
||||
|
@ -342,7 +335,7 @@ class NfsNameRecordRepository(NameRecordRepository):
|
|||
os.remove(path)
|
||||
while True:
|
||||
path = os.path.dirname(path)
|
||||
if path == NfsNameRecordRepository.RECORD_ROOT:
|
||||
if path == self.record_root:
|
||||
break
|
||||
if len(os.listdir(path)) > 0:
|
||||
break
|
||||
|
@ -385,7 +378,7 @@ class NfsNameRecordRepository(NameRecordRepository):
|
|||
continue
|
||||
if files[0] != "ENTRY":
|
||||
continue
|
||||
key = root.removeprefix(self.RECORD_ROOT)
|
||||
key = root.removeprefix(self.record_root)
|
||||
key = key.removeprefix("/")
|
||||
rs.append(self.get(key))
|
||||
except NameEntryNotFoundError:
|
||||
|
@ -402,7 +395,7 @@ class NfsNameRecordRepository(NameRecordRepository):
|
|||
continue
|
||||
if files[0] != "ENTRY":
|
||||
continue
|
||||
key = root.removeprefix(self.RECORD_ROOT)
|
||||
key = root.removeprefix(self.record_root)
|
||||
key = key.removeprefix("/")
|
||||
rs.append(key)
|
||||
except NameEntryNotFoundError:
|
||||
|
@ -571,15 +564,6 @@ class Etcd3NameRecordRepository(NameRecordRepository):
|
|||
TTL-based expiration, atomic operations, and key watching functionality.
|
||||
"""
|
||||
|
||||
# Default configuration
|
||||
try:
|
||||
host, port = os.getenv("REAL_ETCD_ADDR", "").split(":")
|
||||
except ValueError:
|
||||
host, port = "localhost", 2379
|
||||
ETCD_HOST = host
|
||||
ETCD_PORT = int(port)
|
||||
ETCD_USER = None
|
||||
ETCD_PASSWORD = None
|
||||
KEEPALIVE_POLL_FREQUENCY = 1
|
||||
|
||||
@dataclasses.dataclass
|
||||
|
@ -604,14 +588,17 @@ class Etcd3NameRecordRepository(NameRecordRepository):
|
|||
self._lock = threading.Lock()
|
||||
|
||||
# Set connection parameters
|
||||
self._host = host or self.ETCD_HOST
|
||||
self._port = port or self.ETCD_PORT
|
||||
self._user = user or self.ETCD_USER
|
||||
self._password = password or self.ETCD_PASSWORD
|
||||
self._host = host
|
||||
self._port = port
|
||||
self._user = user
|
||||
self._password = password
|
||||
|
||||
# Connect to etcd
|
||||
self._client = etcd3.client(
|
||||
host=self._host, port=self._port, user=self._user, password=self._password
|
||||
host=self._host,
|
||||
port=self._port,
|
||||
user=self._user,
|
||||
password=self._password,
|
||||
)
|
||||
|
||||
# Keep track of entries for cleanup and keepalive
|
||||
|
@ -835,16 +822,17 @@ class Etcd3NameRecordRepository(NameRecordRepository):
|
|||
def reset(self):
|
||||
"""Delete all keys added via this repository instance."""
|
||||
with self._lock:
|
||||
count = 0
|
||||
for name in self._to_delete:
|
||||
if name in self._entries:
|
||||
try:
|
||||
self._delete_locked(name)
|
||||
count += 1
|
||||
except NameEntryNotFoundError:
|
||||
pass
|
||||
self._to_delete = set()
|
||||
logger.info(f"Reset {count} saved etcd entries")
|
||||
if hasattr(self, "_to_delete"):
|
||||
count = 0
|
||||
for name in self._to_delete:
|
||||
if name in self._entries:
|
||||
try:
|
||||
self._delete_locked(name)
|
||||
count += 1
|
||||
except NameEntryNotFoundError:
|
||||
pass
|
||||
self._to_delete = set()
|
||||
logger.info(f"Reset {count} saved etcd entries")
|
||||
|
||||
def _keepalive_thread_run(self):
|
||||
"""Background thread to keep leases alive."""
|
||||
|
@ -1099,12 +1087,6 @@ class RayNameResolveRepository:
|
|||
f"Exception ignored when deleting RayNameResolveRepository: {e}"
|
||||
)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.reset()
|
||||
|
||||
def add(
|
||||
self,
|
||||
name: str,
|
||||
|
@ -1376,31 +1358,19 @@ class RayNameResolveRepository:
|
|||
)
|
||||
|
||||
|
||||
def make_repository(type_="nfs", **kwargs):
if type_ == "memory":
return MemoryNameRecordRepository(**kwargs)
elif type_ == "nfs":
return NfsNameRecordRepository(**kwargs)
elif type_ == "redis":
return RedisNameRecordRepository(**kwargs)
elif type_ == "etcd3":
return Etcd3NameRecordRepository(**kwargs)
elif type_ == "ray":
return RayNameResolveRepository(**kwargs)
def make_repository(args: "NameResolveConfig"):
if args.type == "nfs":
return NfsNameRecordRepository(args.nfs_record_root)
elif args.type == "etcd3":
host, port = args.etcd3_addr.split(":")
return Etcd3NameRecordRepository(host=host, port=int(port))
elif args.type == "ray":
return RayNameResolveRepository(actor_name=args.ray_actor_name)
else:
raise NotImplementedError(f"No such name resolver: {type_}")
raise NotImplementedError(f"No such name resolver: {args.type}")
|
||||
|
||||
|
||||
# DEFAULT_REPOSITORY_TYPE = "redis" if socket.gethostname().startswith("frl") else "nfs"
|
||||
DEFAULT_REPOSITORY_TYPE = "nfs"
|
||||
if etcd3 is not None and os.getenv("REAL_ETCD_ADDR", ""):
|
||||
DEFAULT_REPOSITORY_TYPE = "etcd3"
|
||||
if os.getenv("REAL_ETCD_ADDR", "") and etcd3 is None:
|
||||
logger.warning(
|
||||
f"Detected REAL_ETCD_ADDR but etcd3 client is not available. "
|
||||
"Please run `pip install -r requirements.txt` if you want to use etcd name resolve."
|
||||
)
|
||||
DEFAULT_REPOSITORY = make_repository(DEFAULT_REPOSITORY_TYPE)
|
||||
DEFAULT_REPOSITORY = NfsNameRecordRepository()
|
||||
add = DEFAULT_REPOSITORY.add
|
||||
add_subentry = DEFAULT_REPOSITORY.add_subentry
|
||||
delete = DEFAULT_REPOSITORY.delete
|
||||
|
@ -1413,11 +1383,10 @@ reset = DEFAULT_REPOSITORY.reset
|
|||
watch_names = DEFAULT_REPOSITORY.watch_names
|
||||
|
||||
|
||||
def reconfigure(*args, **kwargs):
|
||||
global DEFAULT_REPOSITORY, DEFAULT_REPOSITORY_TYPE
|
||||
def reconfigure(config: "NameResolveConfig"):
|
||||
global DEFAULT_REPOSITORY
|
||||
global add, add_subentry, delete, clear_subtree, get, get_subtree, find_subtree, wait, reset, watch_names
|
||||
DEFAULT_REPOSITORY = make_repository(*args, **kwargs)
|
||||
DEFAULT_REPOSITORY_TYPE = args[0]
|
||||
DEFAULT_REPOSITORY = make_repository(config)
|
||||
add = DEFAULT_REPOSITORY.add
|
||||
add_subentry = DEFAULT_REPOSITORY.add_subentry
|
||||
delete = DEFAULT_REPOSITORY.delete
|
||||
|
|
|
@ -23,14 +23,20 @@ def gethostip():
|
|||
|
||||
|
||||
def find_free_port(
|
||||
low=1, high=65536, exclude_ports=None, experiment_name="port", trial_name="port"
|
||||
low=1,
|
||||
high=65536,
|
||||
exclude_ports=None,
|
||||
experiment_name="port",
|
||||
trial_name="port",
|
||||
lockfile_root=constants.PORT_LOCKFILE_ROOT,
|
||||
):
|
||||
"""Find a free port within the specified range, excluding certain ports."""
|
||||
|
||||
ports_name = names.used_ports(experiment_name, trial_name, gethostip())
|
||||
|
||||
free_port = None
|
||||
lockfile = os.path.join(constants.PORT_LOCK_FILE_ROOT, gethostip())
|
||||
os.makedirs(lockfile_root, exist_ok=True)
|
||||
lockfile = os.path.join(lockfile_root, gethostip())
|
||||
while True:
|
||||
with open(lockfile, "w") as fd:
|
||||
# This will block until lock is acquired
|
||||
|
@ -58,7 +64,12 @@ def find_free_port(
|
|||
|
||||
|
||||
def find_multiple_free_ports(
|
||||
count, low=1, high=65536, experiment_name="port", trial_name="port"
|
||||
count,
|
||||
low=1,
|
||||
high=65536,
|
||||
experiment_name="port",
|
||||
trial_name="port",
|
||||
lockfile_root=constants.PORT_LOCKFILE_ROOT,
|
||||
):
|
||||
"""Find multiple mutually exclusive free ports."""
|
||||
free_ports = set()
|
||||
|
@ -69,6 +80,7 @@ def find_multiple_free_ports(
|
|||
exclude_ports=free_ports,
|
||||
experiment_name=experiment_name,
|
||||
trial_name=trial_name,
|
||||
lockfile_root=lockfile_root,
|
||||
)
|
||||
free_ports.add(port)
|
||||
return list(free_ports)
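A hedged sketch of the new lockfile_root parameter; values are illustrative, and the default remains constants.PORT_LOCKFILE_ROOT:

```python
# Ports are coordinated through lock files under lockfile_root.
port = find_free_port(
    low=10000,
    high=60000,
    experiment_name="my-exp",   # illustrative names
    trial_name="run0",
    lockfile_root="/tmp/areal/ports/",
)
```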
|
||||
|
|
|
@ -35,26 +35,6 @@ def global_init():
|
|||
if key not in os.environ:
|
||||
os.environ[key] = value
|
||||
|
||||
# resolve config path for cluster spec.
|
||||
cluster_spec_path = os.environ.get("CLUSTER_SPEC_PATH", "")
|
||||
if cluster_spec_path == "":
|
||||
if external_configs.get("cluster_config"):
|
||||
fileroot = external_configs.cluster_config.get("fileroot")
|
||||
if fileroot is not None and fileroot != "":
|
||||
experiment_name = get_experiment_name(config.get("experiment_name"))
|
||||
trial_name = get_trial_name(config.get("trial_name"))
|
||||
config_dir = f"{fileroot}/configs/{getpass.getuser()}/{experiment_name}/{trial_name}"
|
||||
os.makedirs(config_dir, exist_ok=True)
|
||||
cluster_spec_path = f"{config_dir}/cluster_config.json"
|
||||
cluster_spec = OmegaConf.to_container(external_configs.cluster_config)
|
||||
if "cluster_type" not in cluster_spec:
|
||||
cluster_spec["cluster_type"] = config.mode
|
||||
if "cluster_name" not in cluster_spec:
|
||||
cluster_spec["cluster_name"] = f"{config.mode}_cluster"
|
||||
with open(cluster_spec_path, "w") as f:
|
||||
json.dump(cluster_spec, f)
|
||||
os.environ["CLUSTER_SPEC_PATH"] = cluster_spec_path
|
||||
|
||||
|
||||
def get_experiment_name(default_name: str = ""):
|
||||
if any("experiment_name=" in x for x in sys.argv):
|
||||
|
|
|
@ -40,13 +40,11 @@ class RecoverInfo:
|
|||
hash_vals_to_ignore: List[int] = dataclasses.field(default_factory=list)
|
||||
|
||||
|
||||
def dump_recover_info(recover_info: RecoverInfo):
|
||||
def dump_recover_info(args, recover_info: RecoverInfo):
|
||||
global RECOVER_INFO_PATH
|
||||
if RECOVER_INFO_PATH is None:
|
||||
RECOVER_INFO_PATH = os.path.join(
|
||||
constants.RECOVER_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
constants.get_save_path(args),
|
||||
"recover_info.pkl",
|
||||
)
|
||||
os.makedirs(os.path.dirname(RECOVER_INFO_PATH), exist_ok=True)
|
||||
|
@ -54,15 +52,13 @@ def dump_recover_info(recover_info: RecoverInfo):
|
|||
pickle.dump(recover_info, f)
|
||||
|
||||
|
||||
def load_recover_info() -> Tuple[int, Optional[RecoverInfo]]:
|
||||
def load_recover_info(args) -> Tuple[int, Optional[RecoverInfo]]:
|
||||
if os.environ.get("REAL_RECOVER_RUN", "0") != "1":
|
||||
return False, None
|
||||
global RECOVER_INFO_PATH
|
||||
if RECOVER_INFO_PATH is None:
|
||||
RECOVER_INFO_PATH = os.path.join(
|
||||
constants.RECOVER_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
constants.get_save_path(args),
|
||||
"recover_info.pkl",
|
||||
)
|
||||
os.makedirs(os.path.dirname(RECOVER_INFO_PATH), exist_ok=True)
|
||||
|
@ -81,15 +77,9 @@ class InValidRecoverCkpt(Exception):
|
|||
pass
|
||||
|
||||
|
||||
def discover_ckpt(
|
||||
expr_name: str, trial_name: str
|
||||
) -> Tuple[str, List[str], RecoverInfo]:
|
||||
recover_info_file = (
|
||||
pathlib.Path(constants.RECOVER_ROOT)
|
||||
/ expr_name
|
||||
/ trial_name
|
||||
/ "recover_info.pkl"
|
||||
)
|
||||
def discover_ckpt(args) -> Tuple[str, List[str], RecoverInfo]:
|
||||
expr_name, trial_name = args.experiment_name, args.trial_name
|
||||
recover_info_file = pathlib.Path(constants.get_save_path(args)) / "recover_info.pkl"
|
||||
if os.path.exists(str(recover_info_file)):
|
||||
with open(recover_info_file, "rb") as f:
|
||||
info: RecoverInfo = pickle.load(f)
|
||||
|
@ -100,9 +90,7 @@ def discover_ckpt(
|
|||
f"but found {info.last_step_info.epoch}"
|
||||
)
|
||||
raise InValidRecoverCkpt(msg)
|
||||
model_save_dir = (
|
||||
pathlib.Path(constants.MODEL_SAVE_ROOT) / expr_name / trial_name
|
||||
)
|
||||
model_save_dir = pathlib.Path(constants.get_save_path(args))
|
||||
model_ckpt_dirs = []
|
||||
for role in os.listdir(model_save_dir):
|
||||
if "dataset_indices" in role:
|
||||
|
|
|
@ -9,19 +9,17 @@ from typing import List

import numpy as np

from realhf.base.cluster import spec as cluster_spec

def parse_node_id(node_name: str, prefix: str) -> int:
return int(node_name.split(prefix)[-1])

def parse_nodelist(nodelist: str, prefix: str) -> List[str]:
def parse_nodelist(cluster_config, nodelist: str, prefix: str) -> List[str]:
if not nodelist.startswith(prefix):
raise ValueError(
f"Node list `{nodelist}` does not start with hostname prefix `{prefix}`."
)
n = cluster_spec.suffix_n_digits
n = len(str(cluster_config.n_nodes))
nodelist = nodelist.replace(prefix, "")
if "[" not in nodelist:
return [prefix + nodelist]

@ -38,30 +36,6 @@ def parse_nodelist(nodelist: str, prefix: str) -> List[str]:
return [f"{prefix}{node_id:0{n}d}" for node_id in node_ids]
|
||||
|
||||
|
||||
def nodelist_from_nodes(nodes: List[str], prefix: str) -> str:
|
||||
n = cluster_spec.suffix_n_digits
|
||||
node_ids = sorted([parse_node_id(node, prefix) for node in nodes])
|
||||
assert len(node_ids) > 0
|
||||
if len(node_ids) == 1:
|
||||
return f"{prefix}{node_ids[0]:02d}"
|
||||
else:
|
||||
node_reprs = []
|
||||
start, end = node_ids[0], node_ids[0]
|
||||
for i in range(len(node_ids)):
|
||||
node_id = node_ids[i]
|
||||
next_node_id = node_ids[i + 1] if i + 1 < len(node_ids) else -1
|
||||
if node_id + 1 == next_node_id:
|
||||
end = next_node_id
|
||||
else:
|
||||
if start == end:
|
||||
node_reprs.append(f"{start:0{n}d}")
|
||||
else:
|
||||
node_reprs.append(f"{start:0{n}d}-{end:0{n}d}")
|
||||
start = next_node_id
|
||||
end = next_node_id
|
||||
return f"{prefix}[{','.join(node_reprs)}]"
|
||||
|
||||
|
||||
def are_ones_contiguous(binary_array: np.ndarray):
|
||||
one_indices = np.where(binary_array == 1)[0]
|
||||
if len(one_indices) == 0:
|
||||
|
|
|
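For orientation, a self-contained illustration (not repository code) of the nodelist convention these helpers deal with: a Slurm-style string such as slurmd-[01-03,07] expands to zero-padded hostnames, with the padding width now derived from the cluster config's node count rather than a fixed suffix_n_digits.

    import re
    from typing import List

    def expand_nodelist(nodelist: str, prefix: str, n_nodes: int) -> List[str]:
        # width of the numeric suffix, e.g. a 32-node cluster -> 2 digits
        n = len(str(n_nodes))
        body = nodelist.replace(prefix, "")
        if "[" not in body:
            return [prefix + body]
        node_ids: List[int] = []
        for part in re.sub(r"[\[\]]", "", body).split(","):
            if "-" in part:
                lo, hi = map(int, part.split("-"))
                node_ids.extend(range(lo, hi + 1))
            else:
                node_ids.append(int(part))
        return [f"{prefix}{i:0{n}d}" for i in node_ids]

    print(expand_nodelist("slurmd-[01-03,07]", "slurmd-", 32))
    # ['slurmd-01', 'slurmd-02', 'slurmd-03', 'slurmd-07']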
@ -19,6 +19,7 @@ import torch
import torch.distributed as dist
import torch.utils.data

from realhf.api.cli_args import BaseExperimentConfig, NameResolveConfig
from realhf.api.core.data_api import SequenceSample
from realhf.base import constants, gpu_utils, logging, name_resolve, names, topology
from realhf.base.topology import (

@ -92,6 +93,14 @@ class StandaloneTestingProcess(mp.Process):
        if constants.use_cuda():
            torch.cuda.set_device(0)

        from realhf.api.cli_args import NameResolveConfig

        name_resolve.reconfigure(
            NameResolveConfig(
                "nfs", f"/tmp/areal/testing/{self.expr_name}/{self.trial_name}/"
            )
        )

        self.barrier.wait()

        if self.setup_dist_torch:

@ -103,7 +112,11 @@ class StandaloneTestingProcess(mp.Process):
            if self.dist_backend is None:
                self.dist_backend = "gloo" if not constants.use_cuda() else "nccl"
            setup_global_comm(
                self.expr_name, self.trial_name, self.rank, backend=self.dist_backend
                BaseExperimentConfig(),
                self.expr_name,
                self.trial_name,
                self.rank,
                backend=self.dist_backend,
            )

        # misc setup

@ -149,6 +162,11 @@ class LocalMultiProcessTest:
        if torch.cuda.is_available():
            os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
            os.environ["GPU_DEVICES_ISOLATED"] = str(1)
        expr_name = expr_name if expr_name is not None else _DEFAULT_EXPR_NAME
        trial_name = trial_name if trial_name is not None else _DEFAULT_TRIAL_NAME
        name_resolve.reconfigure(
            NameResolveConfig("nfs", f"/tmp/areal/testing/{expr_name}/{trial_name}/")
        )
        clear_name_resolve(expr_name, trial_name)
        self.timeout_secs = timeout_secs
        self.processes = []
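The test harness above reconfigures name resolving to an NFS (plain filesystem) record root under /tmp before spawning subprocesses. A minimal sketch of that call pattern, reusing only calls that appear in this change; the experiment and trial names are placeholders.

    from realhf.api.cli_args import NameResolveConfig
    from realhf.base import name_resolve

    expr_name, trial_name = "test-exp", "test-run"
    name_resolve.reconfigure(
        NameResolveConfig("nfs", f"/tmp/areal/testing/{expr_name}/{trial_name}/")
    )
    # Workers spawned afterwards resolve peer addresses through files under this
    # directory, so single-node tests need no etcd or Ray KV store.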
@ -21,7 +21,6 @@ from typing import Dict, List, NamedTuple, Optional, Tuple
import torch.distributed as dist

import realhf.base.logging as logging
from realhf.base.cluster import spec as cluster_spec
from realhf.base.constants import NCCL_DEFAULT_TIMEOUT

logger = logging.getLogger("Topology")

@ -185,7 +184,7 @@ class ProcessTopology:
        omit_axes = frozenset(omit_axes)
        axes = [a for a in self.get_axis_names() if a not in omit_axes]
        names = []
        n = cluster_spec.suffix_n_digits
        n = len(str(len(self.mapping)))
        for ax in axes:
            ax_rank = getattr(self.get_coord(rank=rank), ax)
            names.append(f"{ax}{inner_sep}{ax_rank:0{n}d}")
@ -2,6 +2,7 @@

import copy
import dataclasses
import os
from typing import Any, Dict, List, Tuple

import realhf.base.logging as logging

@ -13,6 +14,7 @@ from realhf.api.core.config import (
)
from realhf.api.core.model_api import GenerationHyperparameters
from realhf.api.quickstart.entrypoint import register_quickstart_exp
from realhf.base import constants
from realhf.experiments.async_exp.async_rl_exp import AsyncRLExperimentConfig
from realhf.experiments.common.ppo_math_exp import PPOMATHConfig
from realhf.experiments.common.utils import asdict

@ -29,6 +31,9 @@ class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
            "math-single-step",
            args=dict(
                gconfig=self.generation_config,
                answer_save_path=os.path.join(
                    constants.get_log_path(self), "generated"
                ),
                tokenizer_path=self.actor.path,
                success_rate_lb=self.success_rate_lb,
                success_rate_ub=self.success_rate_ub,

@ -40,7 +45,10 @@ class AsyncPPOMATHConfig(AsyncRLExperimentConfig, PPOMATHConfig):
    @property
    def env(self) -> EnvServiceAbstraction:
        return EnvServiceAbstraction(
            "math-code-single-step", args=dict(dataset_path=self.dataset.path)
            "math-code-single-step",
            args=dict(
                dataset_path=self.dataset.path,
            ),
        )

    @property
@ -38,8 +38,6 @@ from realhf.api.core.system_api import (
    TasksGroup,
)
from realhf.api.quickstart.device_mesh import RPCAllocation
from realhf.base.cluster import spec as cluster_spec
from realhf.experiments.common.check import check_valid_sglang, check_valid_vllm
from realhf.experiments.common.common import CommonExperimentConfig
from realhf.experiments.common.utils import (
    AllocationMode,

@ -92,49 +90,57 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions):
        return ExperimentScheduling(
            master_worker=TasksGroup(
                count=1,
                scheduling=Scheduling.master_worker_default(
                scheduling=Scheduling(
                    cpu=self.cpus_per_master_worker,
                    gpu=0,
                    mem=self.mem_per_master_worker,
                    nodelist=self.nodelist,
                    exclude=self.exclude,
                    container_image=self.cluster.cpu_image,
                ),
            ),
            model_worker=TasksGroup(
                count=train_world_size,
                scheduling=Scheduling.model_worker_default(
                scheduling=Scheduling(
                    cpu=self.cpus_per_model_worker,
                    gpu=1,
                    mem=self.mem_per_model_worker,
                    nodelist=self.nodelist,
                    exclude=self.exclude,
                    container_image=self.cluster.gpu_image,
                ),
            ),
            generation_server=TasksGroup(
                count=gen_world_size // gen_tp_size,
                scheduling=Scheduling.generation_server_default(
                scheduling=Scheduling(
                    cpu=self.cpus_per_generation_server,
                    gpu=gen_tp_size,
                    mem=self.mem_per_generation_server,
                    nodelist=self.nodelist,
                    exclude=self.exclude,
                    container_image=self.cluster.gpu_infer_image,
                ),
            ),
            gserver_manager=TasksGroup(
                count=1,
                scheduling=Scheduling.gserver_manager_default(
                scheduling=Scheduling(
                    cpu=self.cpus_per_gserver_manager,
                    gpu=0,
                    mem=self.mem_per_gserver_manager,
                    nodelist=self.nodelist,
                    exclude=self.exclude,
                    container_image=self.cluster.cpu_image,
                ),
            ),
            rollout_worker=TasksGroup(
                count=self.n_rollout_workers or train_world_size,
                scheduling=Scheduling.rollout_worker_default(
                scheduling=Scheduling(
                    cpu=self.cpus_per_rollout_worker,
                    gpu=0,
                    mem=self.mem_per_rollout_worker,
                    nodelist=self.nodelist,
                    exclude=self.exclude,
                    container_image=self.cluster.cpu_image,
                ),
            ),
        )

@ -162,7 +168,11 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions):
            # NOTE: here we use puller stream to wrap the original dataset
            datasets=[
                DatasetAbstraction(
                    "puller_stream", args=dict(dataset_cfgs=self.datasets)
                    "puller_stream",
                    args=dict(
                        dataset_cfgs=self.datasets,
                        args=self,
                    ),
                )
            ],
            torch_cache_mysophobia=self.torch_cache_mysophobia,
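The scheduling refactor replaces the old Scheduling.*_default factory helpers with direct construction of Scheduling, so each worker group states its CPU, GPU, memory, and container image explicitly. A compact sketch of the resulting shape; the numeric values, image names, the None placeholders, and the import location of Scheduling/ExperimentScheduling are illustrative assumptions, only the field names come from the diff.

    from realhf.api.core.system_api import ExperimentScheduling, Scheduling, TasksGroup

    scheduling = ExperimentScheduling(
        master_worker=TasksGroup(
            count=1,
            scheduling=Scheduling(
                cpu=4, gpu=0, mem=20000,  # illustrative numbers
                nodelist=None, exclude=None,
                container_image="cpu-image:latest",  # placeholder image name
            ),
        ),
        model_worker=TasksGroup(
            count=8,
            scheduling=Scheduling(
                cpu=16, gpu=1, mem=100000,
                nodelist=None, exclude=None,
                container_image="gpu-image:latest",
            ),
        ),
    )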
@ -1,259 +0,0 @@
|
|||
# Copyright 2025 Ant Group Inc.
|
||||
# Copyright 2024 Wei Fu & Zhiyu Mei
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
|
||||
import copy
|
||||
import dataclasses
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from typing import *
|
||||
|
||||
from omegaconf import OmegaConf
|
||||
|
||||
from realhf.api.cli_args import (
|
||||
MFCConfig,
|
||||
ModelTrainEvalConfig,
|
||||
ParallelismConfig,
|
||||
PromptOnlyDatasetConfig,
|
||||
)
|
||||
from realhf.api.core.config import (
|
||||
DatasetAbstraction,
|
||||
ModelInterfaceAbstraction,
|
||||
ModelInterfaceType,
|
||||
)
|
||||
from realhf.api.core.dfg import MFCDef
|
||||
from realhf.api.core.system_api import ExperimentConfig
|
||||
from realhf.api.quickstart.entrypoint import register_quickstart_exp
|
||||
from realhf.base import constants, logging
|
||||
from realhf.base.topology import decompose_to_three_factors
|
||||
from realhf.experiments.common.common import CommonExperimentConfig
|
||||
|
||||
logger = logging.getLogger("Profiling Experiment", "system")
|
||||
|
||||
|
||||
def default_parallel_config(n_gpus: int) -> List[Dict[str, Any]]:
|
||||
factors = decompose_to_three_factors(n_gpus)
|
||||
x = [
|
||||
{
|
||||
"data_parallel_size": dp,
|
||||
"tensor_parallel_size": tp,
|
||||
"pipeline_parallel_size": pp,
|
||||
"use_sequence_parallel": tp > 1,
|
||||
}
|
||||
for dp, tp, pp in factors
|
||||
]
|
||||
x += [
|
||||
{
|
||||
"data_parallel_size": dp,
|
||||
"tensor_parallel_size": tp,
|
||||
"pipeline_parallel_size": pp,
|
||||
"use_sequence_parallel": False,
|
||||
}
|
||||
for dp, tp, pp in factors
|
||||
if tp > 1
|
||||
]
|
||||
return x
|
||||
|
||||
|
||||
def dataclass_from_dict(klass, d):
|
||||
try:
|
||||
fieldtypes = {f.name: f.type for f in dataclasses.fields(klass)}
|
||||
return klass(**{f: dataclass_from_dict(fieldtypes[f], d[f]) for f in d})
|
||||
except:
|
||||
return d # Not a dataclass field
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ProfileConfig(CommonExperimentConfig):
|
||||
"""The experiment configuration for profiling layers and interfaces.
|
||||
|
||||
The `initial_setup` method in this experiment will return a list of
|
||||
experiment configurations, which will be run sequentially.
|
||||
All configurations share the same experiment name, trial name,
|
||||
and the scheduling configuration. They can have different models,
|
||||
datasets, or parallel strategies, as long as they always occupy
|
||||
a fixed number of GPUs.
|
||||
|
||||
It's important to note that, if any error occurs during the execution,
|
||||
the experiment will terminate immediately. In particular, the OOM error
|
||||
should not appear because the profiling setup usually uses a small model.
|
||||
"""
|
||||
|
||||
interfaces_jsonl: str = ""
|
||||
allocations_jsonl: Optional[str] = None
|
||||
handle_names: Optional[List[str]] = None
|
||||
n_mbs: Optional[List[int]] = None
|
||||
batch_sizes: Optional[List[int]] = None
|
||||
models_jsonl: str = ""
|
||||
datasets_jsonl: str = ""
|
||||
|
||||
def __post_init__(self):
|
||||
# Check that handle_name belones to ["train_step", "generate", "inference"]
|
||||
self.handle_names = list(set(self.handle_names))
|
||||
if any(
|
||||
k not in ["train_step", "generate", "inference"] for k in self.handle_names
|
||||
):
|
||||
raise NotImplementedError(f"Unknown handle_name: {self.handle_name}")
|
||||
|
||||
# Check the configuration of interfaces
|
||||
if not os.path.exists(self.interfaces_jsonl):
|
||||
raise FileNotFoundError(
|
||||
f"File not found: {self.interfaces_jsonl}. "
|
||||
"It should be a JSONL file specifying the arguments "
|
||||
"for the interface implementation."
|
||||
)
|
||||
with open(self.interfaces_jsonl, "r") as f:
|
||||
self.interface_kwargs = [json.loads(l) for l in f.readlines()]
|
||||
|
||||
# Check the configuration of parallel strategies.
|
||||
if self.allocations_jsonl is None:
|
||||
self.parallel_kwargs = default_parallel_config(
|
||||
self.n_nodes * self.n_gpus_per_node
|
||||
)
|
||||
else:
|
||||
assert self.allocations_jsonl.endswith(".jsonl")
|
||||
assert os.path.exists(self.allocations_jsonl)
|
||||
with open(self.allocations_jsonl, "r") as f:
|
||||
self.parallel_kwargs = [json.loads(l) for l in f.readlines()]
|
||||
for pcfg in self.parallel_kwargs:
|
||||
assert isinstance(pcfg, dict), type(pcfg)
|
||||
assert all(
|
||||
k
|
||||
in [
|
||||
"data_parallel_size",
|
||||
"tensor_parallel_size",
|
||||
"pipeline_parallel_size",
|
||||
"use_sequence_parallel",
|
||||
]
|
||||
for k in pcfg.keys()
|
||||
), pcfg.keys()
|
||||
assert (self.n_nodes * self.n_gpus_per_node) == (
|
||||
pcfg.get("data_parallel_size", 1)
|
||||
* pcfg.get("tensor_parallel_size", 1)
|
||||
* pcfg.get("pipeline_parallel_size", 1)
|
||||
)
|
||||
|
||||
if self.n_mbs is None:
|
||||
self.n_mbs = [1]
|
||||
else:
|
||||
self.n_mbs = OmegaConf.to_container(self.n_mbs)
|
||||
assert isinstance(self.n_mbs, list), type(self.n_mbs)
|
||||
assert all(isinstance(x, int) for x in self.n_mbs)
|
||||
|
||||
assert self.batch_sizes is not None
|
||||
|
||||
assert os.path.exists(self.models_jsonl)
|
||||
with open(self.models_jsonl, "r") as f:
|
||||
self.model_kwargs = [json.loads(l) for l in f.readlines()]
|
||||
|
||||
assert os.path.exists(self.datasets_jsonl)
|
||||
with open(self.datasets_jsonl, "r") as f:
|
||||
self.dataset_kwargs = [json.loads(l) for l in f.readlines()]
|
||||
assert all(x["type_"] == "prompt" for x in self.dataset_kwargs)
|
||||
|
||||
@property
|
||||
def allocations(self):
|
||||
return dict(default=self._tmp_allocation)
|
||||
|
||||
@property
|
||||
def models(self):
|
||||
return dict(default=self._tmp_model)
|
||||
|
||||
@property
|
||||
def tokenizer_name_or_path(self):
|
||||
return self._tmp_model.path
|
||||
|
||||
@property
|
||||
def max_prompt_len(self):
|
||||
return self._tmp_dataset.args["max_length"]
|
||||
|
||||
@property
|
||||
def datasets(self):
|
||||
return [self._tmp_dataset]
|
||||
|
||||
@property
|
||||
def rpcs(self):
|
||||
return dict(default=self._tmp_rpc)
|
||||
|
||||
def initial_setup(self) -> List[ExperimentConfig]:
|
||||
self.allocation_mode = "manual"
|
||||
setups = []
|
||||
setup_log_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
self.experiment_name,
|
||||
self.trial_name,
|
||||
"setups.jsonl",
|
||||
)
|
||||
logger.info(
|
||||
f"Experiment setup configurations of the profiling experiment "
|
||||
f"will be saved to: {setup_log_path}"
|
||||
)
|
||||
with open(setup_log_path, "w") as f:
|
||||
# batch size in the most outer loop to delay the possible OOM error
|
||||
for (
|
||||
bs,
|
||||
pcfg,
|
||||
n_mbs,
|
||||
model_cfg,
|
||||
dataset_cfg,
|
||||
handle_name,
|
||||
interface_cfg,
|
||||
) in itertools.product(
|
||||
self.batch_sizes,
|
||||
self.parallel_kwargs,
|
||||
self.n_mbs,
|
||||
self.model_kwargs,
|
||||
self.dataset_kwargs,
|
||||
self.handle_names,
|
||||
self.interface_kwargs,
|
||||
):
|
||||
if handle_name == "generate" and pcfg["use_sequence_parallel"]:
|
||||
continue
|
||||
|
||||
kwargs_stat = dict(
|
||||
parallel=pcfg,
|
||||
n_mbs=n_mbs,
|
||||
model=model_cfg,
|
||||
dataset=dataset_cfg,
|
||||
interface=interface_cfg,
|
||||
bs=bs,
|
||||
)
|
||||
f.write(json.dumps(kwargs_stat) + "\n")
|
||||
|
||||
# Create tmp object for constructing experiment setups
|
||||
self._tmp_allocation = MFCConfig(
|
||||
parallel=ParallelismConfig(**pcfg), n_mbs=n_mbs
|
||||
)
|
||||
self._tmp_model = dataclass_from_dict(ModelTrainEvalConfig, model_cfg)
|
||||
self._tmp_dataset = DatasetAbstraction(**dataset_cfg)
|
||||
if handle_name == "train_step":
|
||||
interface_type = ModelInterfaceType.TRAIN_STEP
|
||||
elif handle_name == "inference":
|
||||
interface_type = ModelInterfaceType.INFERENCE
|
||||
elif handle_name == "generate":
|
||||
interface_type = ModelInterfaceType.GENERATE
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"Unknown which handle to run in the interface: {self.handle_name}"
|
||||
)
|
||||
self._tmp_rpc = MFCDef(
|
||||
n_seqs=bs,
|
||||
name="default",
|
||||
n_mbs=n_mbs,
|
||||
interface_type=interface_type,
|
||||
interface_impl=ModelInterfaceAbstraction(**interface_cfg),
|
||||
model_name="default",
|
||||
input_keys=["packed_prompts"],
|
||||
log_return_value=False,
|
||||
balanced_dp=True,
|
||||
)
|
||||
|
||||
setup = copy.deepcopy(super().initial_setup())
|
||||
for m in setup.model_worker:
|
||||
m.profile_mode = True
|
||||
setups.append(setup)
|
||||
return setups
|
||||
|
||||
|
||||
register_quickstart_exp("profile", ProfileConfig)
|
|
@ -42,7 +42,6 @@ from realhf.api.quickstart.device_mesh import (
|
|||
RPCAllocation,
|
||||
make_device_mesh_from_name,
|
||||
)
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.experiments.common.check import (
|
||||
check_valid_model_and_path,
|
||||
check_valid_optimizer,
|
||||
|
@ -164,21 +163,24 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
|
|||
return ExperimentScheduling(
|
||||
master_worker=TasksGroup(
|
||||
count=1,
|
||||
scheduling=Scheduling.master_worker_default(
|
||||
scheduling=Scheduling(
|
||||
cpu=self.cpus_per_master_worker,
|
||||
gpu=0,
|
||||
mem=self.mem_per_master_worker,
|
||||
nodelist=self.nodelist,
|
||||
exclude=self.exclude,
|
||||
container_image=self.cluster.cpu_image,
|
||||
),
|
||||
),
|
||||
model_worker=TasksGroup(
|
||||
count=self.n_nodes * self.n_gpus_per_node,
|
||||
scheduling=Scheduling.model_worker_default(
|
||||
scheduling=Scheduling(
|
||||
cpu=self.cpus_per_model_worker,
|
||||
gpu=1,
|
||||
mem=self.mem_per_model_worker,
|
||||
nodelist=self.nodelist,
|
||||
exclude=self.exclude,
|
||||
container_image=self.cluster.gpu_image,
|
||||
),
|
||||
),
|
||||
)
|
||||
|
@ -326,6 +328,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
|
|||
rpc=rpc,
|
||||
device_mesh=(
|
||||
make_device_mesh_from_name(
|
||||
self.cluster,
|
||||
self.nodelist,
|
||||
self.allocations[rpc_type].device_mesh,
|
||||
self.global_device_mesh.n_gpus_per_node,
|
||||
|
@ -579,7 +582,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
|
|||
)
|
||||
if self.n_gpus_per_node > self.cluster.n_gpus_per_node:
|
||||
raise ValueError(
|
||||
f"Number of 7used GPUs per node {self.n_gpus_per_node} should not be larger than the cluster limit {self.cluster.n_gpus_per_node}"
|
||||
f"Number of used GPUs per node {self.n_gpus_per_node} should not be larger than the cluster limit {self.cluster.n_gpus_per_node}"
|
||||
)
|
||||
if self.n_nodes > 1 and self.n_gpus_per_node != self.cluster.n_gpus_per_node:
|
||||
raise ValueError(
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
# Copyright 2024 Wei Fu & Zhiyu Mei
|
||||
# Licensed under the Apache License, Version 2.0 (the "License").
|
||||
import dataclasses
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import realhf.base.logging as logging
|
||||
from realhf.api.cli_args import MathCodeEvalOptions, ModelTrainEvalConfig
|
||||
from realhf.api.core.config import (
|
||||
DatasetAbstraction,
|
||||
|
@ -13,6 +13,7 @@ from realhf.api.core.config import (
|
|||
)
|
||||
from realhf.api.core.dfg import MFCDef
|
||||
from realhf.api.quickstart.entrypoint import register_quickstart_exp
|
||||
from realhf.base import constants, logging
|
||||
from realhf.experiments.common.common import CommonExperimentConfig
|
||||
from realhf.experiments.common.utils import asdict
|
||||
|
||||
|
@ -55,6 +56,9 @@ class MathCodeEvalConfig(MathCodeEvalOptions, CommonExperimentConfig):
|
|||
dataset_path=self.dataset.path,
|
||||
tokenizer_path=self.actor.path,
|
||||
rw_type=self.rw_type,
|
||||
answer_save_path=os.path.join(
|
||||
constants.get_log_path(self), "generated"
|
||||
),
|
||||
check_xml_format=self.check_xml_format,
|
||||
group_size=self.group_size,
|
||||
check_verifier_status=self.check_verifier_status,
|
||||
|
|
|
@ -6,7 +6,6 @@ import dataclasses
|
|||
import os
|
||||
from typing import Dict
|
||||
|
||||
import realhf.base.logging as logging
|
||||
from realhf.api.cli_args import ModelTrainEvalConfig, PPOMATHExperimentOptions
|
||||
from realhf.api.core.config import (
|
||||
DatasetAbstraction,
|
||||
|
@ -16,6 +15,7 @@ from realhf.api.core.config import (
|
|||
from realhf.api.core.dfg import MFCDef, ParamReallocHook
|
||||
from realhf.api.core.system_api import ExperimentConfig
|
||||
from realhf.api.quickstart.entrypoint import register_quickstart_exp
|
||||
from realhf.base import constants, logging
|
||||
from realhf.experiments.common.common import CommonExperimentConfig
|
||||
from realhf.experiments.common.utils import (
|
||||
asdict,
|
||||
|
@ -132,6 +132,9 @@ class PPOMATHConfig(CommonExperimentConfig, PPOMATHExperimentOptions):
|
|||
check_xml_format=self.check_xml_format,
|
||||
group_size=self.group_size,
|
||||
check_verifier_status=self.check_verifier_status,
|
||||
answer_save_path=os.path.join(
|
||||
constants.get_log_path(self), "generated"
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
|
|
|
@ -32,6 +32,7 @@ class MathMultiTurnAgent(Agent):
|
|||
self,
|
||||
gconfig,
|
||||
tokenizer_path,
|
||||
answer_save_path,
|
||||
reward_scaling=1.0,
|
||||
reward_bias=0.0,
|
||||
turn_level_discount: float = 1.0,
|
||||
|
@ -39,6 +40,7 @@ class MathMultiTurnAgent(Agent):
|
|||
):
|
||||
self.gconfig = gconfig.new(n=1)
|
||||
self.tokenizer = load_hf_tokenizer(tokenizer_path)
|
||||
self.answer_save_path = answer_save_path
|
||||
|
||||
self.reward_scaling = reward_scaling
|
||||
self.reward_bias = reward_bias
|
||||
|
@ -245,10 +247,7 @@ class MathMultiTurnAgent(Agent):
|
|||
for group_idx in range(group_size):
|
||||
# NOTE: we can ensure that only one process is logging this query id
|
||||
gen_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"generated",
|
||||
self.answer_save_path,
|
||||
str(version_starts[group_idx]),
|
||||
f"{qid}.txt",
|
||||
)
|
||||
|
@ -271,10 +270,7 @@ class MathMultiTurnAgent(Agent):
|
|||
_f.write(info + "\n")
|
||||
|
||||
train_pass_monitor_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"training_monitor",
|
||||
self.answer_save_path,
|
||||
str(version_starts[group_idx]),
|
||||
f"{qid}.jsonl",
|
||||
)
|
||||
|
|
|
@ -25,6 +25,7 @@ class MathSingleStepAgent(Agent):
|
|||
self,
|
||||
gconfig,
|
||||
tokenizer_path,
|
||||
answer_save_path,
|
||||
success_rate_lb,
|
||||
success_rate_ub,
|
||||
reward_scaling=1.0,
|
||||
|
@ -32,6 +33,7 @@ class MathSingleStepAgent(Agent):
|
|||
):
|
||||
self.gconfig = gconfig
|
||||
self.tokenizer = load_hf_tokenizer(tokenizer_path)
|
||||
self.answer_save_path = answer_save_path
|
||||
|
||||
self.success_rate_lb = success_rate_lb
|
||||
self.success_rate_ub = success_rate_ub
|
||||
|
@ -198,10 +200,7 @@ class MathSingleStepAgent(Agent):
|
|||
for group_idx in range(group_size):
|
||||
# NOTE: we can ensure that only one process is logging this query id
|
||||
gen_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"generated",
|
||||
self.answer_save_path,
|
||||
str(version_starts[group_idx]),
|
||||
f"{qid}.txt",
|
||||
)
|
||||
|
@ -224,10 +223,7 @@ class MathSingleStepAgent(Agent):
|
|||
_f.write(info + "\n")
|
||||
|
||||
train_pass_monitor_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"training_monitor",
|
||||
self.answer_save_path,
|
||||
str(version_starts[group_idx]),
|
||||
f"{qid}.jsonl",
|
||||
)
|
||||
|
|
|
@ -21,7 +21,6 @@ from realhf.api.core import model_api
|
|||
from realhf.api.core.data_api import SequenceSample
|
||||
from realhf.base import constants, logging, pkg_version
|
||||
from realhf.base.datapack import flat2d
|
||||
from realhf.base.monitor import CUDATimeMarkType, cuda_tmarked
|
||||
from realhf.impl.model.backend.inference import PipelinableInferenceEngine
|
||||
from realhf.impl.model.backend.pipe_runner import PipelineRunner, PipeTrainInstrSet
|
||||
from realhf.impl.model.modules.mlp import get_activation_fn
|
||||
|
@ -304,7 +303,6 @@ class PipeTrainInstrSetForMegatron(PipeTrainInstrSet):
|
|||
self._no_sync_context.__exit__(None, None, None)
|
||||
self._no_sync_context = None
|
||||
|
||||
@cuda_tmarked("bwd", CUDATimeMarkType.backward)
|
||||
def _exec_backward_pass(
|
||||
self,
|
||||
module: ReaLModel,
|
||||
|
@ -342,7 +340,6 @@ class PipeTrainInstrSetForMegatron(PipeTrainInstrSet):
|
|||
# self.engine.ddp.start_grad_sync()
|
||||
self.engine.finalize_grads()
|
||||
|
||||
@cuda_tmarked("opt", CUDATimeMarkType.optim_step)
|
||||
def _exec_optimizer_step(
|
||||
self,
|
||||
module: ReaLModel,
|
||||
|
@ -489,8 +486,7 @@ class ReaLMegatronEngine(model_api.PipelinableEngine):
|
|||
loss_scale *= constants.data_parallel_world_size()
|
||||
loss_scale *= self.engine.optim.get_loss_scale().item()
|
||||
loss *= loss_scale
|
||||
with cuda_tmarked("bwd", CUDATimeMarkType.backward):
|
||||
loss.backward()
|
||||
loss.backward()
|
||||
|
||||
self.engine.finalize_grads()
|
||||
return self._step(version_steps)
|
||||
|
@ -530,7 +526,6 @@ class ReaLMegatronEngine(model_api.PipelinableEngine):
|
|||
)
|
||||
|
||||
# wrapper for profiler
|
||||
@cuda_tmarked("opt", CUDATimeMarkType.optim_step)
|
||||
def _step(self, version_steps):
|
||||
# omit the number of zeros in grads
|
||||
update_successful, grad_norm, _ = self.engine.optim.step()
|
||||
|
|
|
@ -32,7 +32,6 @@ from realhf.api.core.model_api import (
    register_backend,
)
from realhf.base import (
    cluster,
    constants,
    datapack,
    gpu_utils,

@ -431,9 +430,9 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
    def _initialize(self, model: Model, spec: FinetuneSpec) -> Model:
        if constants.pipe_parallel_world_size() != 1:
            raise RuntimeError("SGLang does not support pipe parallel size > 1.")
        if constants.tensor_parallel_world_size() > cluster.spec.n_gpus_per_node:
        if constants.tensor_parallel_world_size() > torch.cuda.device_count():
            raise RuntimeError(
                "AReaL's SGLang integration does not support model parallel size > n_gpus_per_node."
                "AReaL's SGLang integration does not support model parallel size > torch.cuda.device_count()."
            )

        additional_args = dataclasses.asdict(self)

@ -453,6 +452,9 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
                high=60000,
                experiment_name=constants.experiment_name(),
                trial_name=constants.trial_name(),
                lockfile_root=os.path.join(
                    constants.get_cache_path(self.args), "ports"
                ),
            ),
            group=constants.data_parallel_group(),
        )

@ -475,10 +477,6 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
            tp_size=constants.tensor_parallel_world_size(),
            # Because we have set CUDA_VISIBLE_DEVICES to a single GPU in each process
            base_gpu_id=int(os.environ["CUDA_VISIBLE_DEVICES"]),
            file_storage_path=os.path.join(
                constants.SGLANG_CACHE_PATH,
                f"sglang_storage{constants.data_parallel_rank()}",
            ),
            # Data parallelism
            dp_size=1,  # TODO: check whether we require SGLang dp
            load_balance_method="round_robin",
@ -46,6 +46,7 @@ def filter_match_mwids(


def setup_global_comm(
    args,
    expr_name: str,
    trial_name: str,
    worker_index: int,

@ -87,10 +88,12 @@ def setup_global_comm(
    )

    if constants.use_cuda():
        assert len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) == 1, os.environ[
            "CUDA_VISIBLE_DEVICES"
        ]
        local_gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
        if len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) == 1:
            local_gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
        else:
            local_gpu_id = int(
                os.environ["CUDA_VISIBLE_DEVICES"].split(",")[worker_index]
            )
    else:
        local_gpu_id = global_rank

@ -100,7 +103,11 @@ def setup_global_comm(

    if worker_index == 0:
        host_ip = socket.gethostbyname(socket.gethostname())
        port = network.find_free_port(experiment_name=expr_name, trial_name=trial_name)
        port = network.find_free_port(
            experiment_name=expr_name,
            trial_name=trial_name,
            lockfile_root=os.path.join(constants.get_cache_path(args), "ports"),
        )
        pg_init_addr = f"tcp://{host_ip}:{port}"
        name_resolve.add(pg_master_name, pg_init_addr, keepalive_ttl=300)
    else:
@ -181,6 +181,7 @@ def retokenize_and_verify(
|
|||
class MultiTaskRewardInterface(model_api.ModelInterface):
|
||||
dataset_path: str = ""
|
||||
tokenizer_path: str = "/storage/models/Qwen__Qwen2.5-1.5B"
|
||||
answer_save_path: str = "."
|
||||
output_scaling: float = 1.0
|
||||
output_bias: float = 0.0
|
||||
rw_type: str = "sparse"
|
||||
|
@ -363,10 +364,7 @@ class MultiTaskRewardInterface(model_api.ModelInterface):
|
|||
):
|
||||
tik = time.perf_counter()
|
||||
gen_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"generated",
|
||||
self.answer_save_path,
|
||||
task_type,
|
||||
f"v{model.version.global_step}r{dist.get_rank()}.txt",
|
||||
)
|
||||
|
@ -386,10 +384,7 @@ class MultiTaskRewardInterface(model_api.ModelInterface):
|
|||
_f.write(info + "\n")
|
||||
|
||||
gen_file_path = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"generated_jsonl",
|
||||
self.answer_save_path,
|
||||
task_type,
|
||||
f"v{model.version.global_step}r{dist.get_rank()}.jsonl",
|
||||
)
|
||||
|
|
|
@ -17,7 +17,6 @@ import transformers
|
|||
from realhf.api.core import model_api
|
||||
from realhf.api.core.config import ModelName
|
||||
from realhf.base import constants, logging, topology
|
||||
from realhf.base.monitor import CUDATimeMarkType, cuda_tmark, cuda_tmarked
|
||||
from realhf.impl.model.comm.global_comm import NCCLProcessGroupInfo
|
||||
from realhf.impl.model.comm.param_realloc import (
|
||||
ReparallelizeReceiverStep,
|
||||
|
@ -470,13 +469,11 @@ class ReaLModel(nn.Module):
|
|||
pp_input_buf[:batch_length] = x.pp_input
|
||||
x.pp_input = pp_input_buf
|
||||
|
||||
tmark_type = CUDATimeMarkType.forward
|
||||
with cuda_tmarked("fwd", tmark_type):
|
||||
# Main forward calls.
|
||||
if not self._offloaded:
|
||||
x, ys = self.__forward(x, ys)
|
||||
else:
|
||||
x, ys = self.__overlapped_load_forward(x, ys)
|
||||
# Main forward calls.
|
||||
if not self._offloaded:
|
||||
x, ys = self.__forward(x, ys)
|
||||
else:
|
||||
x, ys = self.__overlapped_load_forward(x, ys)
|
||||
|
||||
# Resume from padding.
|
||||
if (
|
||||
|
@ -644,7 +641,6 @@ class ReaLModel(nn.Module):
|
|||
self._reparallelize_targets[(from_model_name, to_model_name)] = rtgt
|
||||
|
||||
# FIXME: we can get topo given model name from constants
|
||||
@cuda_tmark("param_realloc", CUDATimeMarkType.mem_layout)
|
||||
def build_reparallelized_layers_async(
|
||||
self,
|
||||
from_model_name: ModelName,
|
||||
|
|
|
@ -166,7 +166,7 @@ def build_leave_one_indices(
    )


def _gather_logprobs(
def gather_logprobs(
    logits: torch.Tensor,
    labels: torch.Tensor,
):

@ -186,24 +186,6 @@ def _gather_logprobs(
    return log_probs_labels


_gather_logprobs_compiled = None


def gather_logprobs(
    logits: torch.Tensor,
    labels: torch.Tensor,
):
    from realhf.base import cluster

    if cluster.spec.name == "wa180":
        # torch.compile doesn't work on PPU
        return _gather_logprobs(logits, labels)
    global _gather_logprobs_compiled
    if _gather_logprobs_compiled is None:
        _gather_logprobs_compiled = torch.compile(_gather_logprobs)
    return _gather_logprobs_compiled(logits, labels)


def gather_packed_shifted_log_probs(
    logits: torch.FloatTensor,
    cu_seqlens: torch.Tensor,
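For reference, gather_logprobs computes the standard quantity: a log-softmax over the vocabulary followed by picking the log-probability at each label index. A self-contained reference sketch (not the repository implementation, which may compile or fuse this):

    import torch

    def gather_logprobs_ref(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        # logits: [..., seqlen, vocab], labels: [..., seqlen] of token ids
        log_probs = torch.log_softmax(logits.float(), dim=-1)
        # pick log P(label_t) at every position
        return log_probs.gather(dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)

    logits = torch.randn(2, 5, 32)
    labels = torch.randint(0, 32, (2, 5))
    print(gather_logprobs_ref(logits, labels).shape)  # torch.Size([2, 5])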
@ -5,9 +5,10 @@
|
|||
import dataclasses
|
||||
import enum
|
||||
import subprocess
|
||||
from typing import List, Optional
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
if TYPE_CHECKING:
|
||||
from realhf.api.cli_args import BaseExperimentConfig
|
||||
|
||||
|
||||
class JobState(enum.Enum):
|
||||
|
@ -50,10 +51,11 @@ class JobInfo:
|
|||
|
||||
class SchedulerClient:
|
||||
|
||||
def __init__(self, expr_name, trial_name):
|
||||
self.expr_name = expr_name
|
||||
self.trial_name = trial_name
|
||||
self.run_name = f"{expr_name}_{trial_name}"
|
||||
def __init__(self, args: "BaseExperimentConfig"):
|
||||
self.args = args
|
||||
self.expr_name = args.experiment_name
|
||||
self.trial_name = args.trial_name
|
||||
self.run_name = f"{self.expr_name}_{self.trial_name}"
|
||||
|
||||
def submit(self, worker_type, cmd, **kwargs):
|
||||
"""Submits a job to the scheduler. Raises exception if the job is
|
||||
|
@ -120,16 +122,10 @@ class SchedulerClient:
|
|||
raise NotImplementedError()
|
||||
|
||||
|
||||
def get_python3_path():
|
||||
if cluster_spec.cluster_type == "ray":
|
||||
return subprocess.check_output(["which", "python3"]).decode("utf-8").strip()
|
||||
return "python3"
|
||||
|
||||
|
||||
def remote_worker_cmd(expr_name, trial_name, debug, worker_type):
|
||||
# requires information in scheduler package
|
||||
return (
|
||||
f"{get_python3_path()} {'' if debug else '-O'} -m realhf.apps.remote worker -w {worker_type} "
|
||||
f"python3 {'' if debug else '-O'} -m realhf.apps.remote worker -w {worker_type} "
|
||||
f"-e {expr_name} -f {trial_name} -i {{jobstep_id}} -g {{n_jobsteps}} -r {{worker_submission_index}} "
|
||||
f"-p {{wprocs_per_jobstep}} -j {{wprocs_in_job}} -o {{wproc_offset}}"
|
||||
)
|
||||
|
@ -137,7 +133,7 @@ def remote_worker_cmd(expr_name, trial_name, debug, worker_type):
|
|||
|
||||
def setup_cmd(expr_name, trial_name, debug):
|
||||
bash_cmd = ( # f"pip3 install -e $REAL_PACKAGE_PATH --no-build-isolation && "
|
||||
f"{get_python3_path()} {'' if debug else '-O'} -m realhf.apps.remote "
|
||||
f"python3 {'' if debug else '-O'} -m realhf.apps.remote "
|
||||
f"reset_name_resolve -e {expr_name} -f {trial_name}"
|
||||
)
|
||||
# return f"bash -c \"{bash_cmd}\""
|
||||
|
@ -146,7 +142,7 @@ def setup_cmd(expr_name, trial_name, debug):
|
|||
|
||||
def control_cmd(expr_name, trial_name, debug, ignore_worker_error, controller_type):
|
||||
bash_cmd = ( # f"pip3 install -e $REAL_PACKAGE_PATH --no-build-isolation && "
|
||||
f"{get_python3_path()} {'' if debug else '-O'} -m realhf.apps.remote controller "
|
||||
f"python3 {'' if debug else '-O'} -m realhf.apps.remote controller "
|
||||
f"-e {expr_name} -f {trial_name} "
|
||||
f"--{'ignore_worker_error' if ignore_worker_error else 'raise_worker_error'} "
|
||||
f"--type {controller_type}"
|
||||
|
@ -155,25 +151,23 @@ def control_cmd(expr_name, trial_name, debug, ignore_worker_error, controller_ty
|
|||
return bash_cmd
|
||||
|
||||
|
||||
def make(mode, expr_name, trial_name, **kwargs) -> SchedulerClient:
|
||||
if mode == "slurm":
|
||||
def make(args: "BaseExperimentConfig", **kwargs) -> SchedulerClient:
|
||||
if args.mode == "slurm":
|
||||
from realhf.scheduler.slurm.client import SlurmSchedulerClient
|
||||
|
||||
schedule_strategy = kwargs.get("schedule_strategy", "empty_first")
|
||||
evaluator = kwargs.get("evaluator", None)
|
||||
job_group_id = kwargs.get("job_group_id", None)
|
||||
job_group_index = kwargs.get("job_group_index", None)
|
||||
evaluator = kwargs.get("evaluator", None)
|
||||
return SlurmSchedulerClient(
|
||||
expr_name,
|
||||
trial_name,
|
||||
schedule_strategy,
|
||||
args,
|
||||
args.schedule_strategy,
|
||||
evaluator,
|
||||
job_group_id,
|
||||
job_group_index,
|
||||
)
|
||||
elif mode == "local":
|
||||
elif args.mode == "local":
|
||||
from realhf.scheduler.local.client import LocalSchedulerClient
|
||||
|
||||
return LocalSchedulerClient(expr_name, trial_name)
|
||||
return LocalSchedulerClient(args)
|
||||
else:
|
||||
raise NotImplementedError(f"Scheduler {mode} not found")
|
||||
|
|
|
@ -6,13 +6,14 @@ import pathlib
|
|||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import swanlab
|
||||
import wandb
|
||||
|
||||
import realhf.api.core.system_api as config_pkg
|
||||
from realhf.base import cluster, constants, logging
|
||||
from realhf.api.cli_args import BaseExperimentConfig
|
||||
from realhf.base import constants, logging
|
||||
|
||||
logger = logging.getLogger("AutomaticEvaluator", "colored")
|
||||
|
||||
|
@ -27,6 +28,7 @@ class EvaluationStepStatus(enum.Enum):
|
|||
|
||||
@dataclasses.dataclass
|
||||
class EvaluationStep:
|
||||
args: BaseExperimentConfig
|
||||
global_step: int
|
||||
status: EvaluationStepStatus
|
||||
start_time: Optional[float] = None
|
||||
|
@ -34,7 +36,7 @@ class EvaluationStep:
|
|||
process: Optional[subprocess.Popen] = None
|
||||
|
||||
@staticmethod
|
||||
def from_ckpt_dir(ckpt_dir):
|
||||
def from_ckpt_dir(args, ckpt_dir):
|
||||
# NOTE: ckpt_dir should be absolute path
|
||||
if pathlib.Path(ckpt_dir).is_symlink():
|
||||
return None
|
||||
|
@ -44,13 +46,14 @@ class EvaluationStep:
|
|||
return None
|
||||
_, _, global_step = map(int, match.groups())
|
||||
return EvaluationStep(
|
||||
args=args,
|
||||
global_step=global_step,
|
||||
status=EvaluationStepStatus.PENDING,
|
||||
ckpt_dir=ckpt_dir,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_output_dir(output_dir):
|
||||
def from_output_dir(args, output_dir):
|
||||
# NOTE: output_dir should be absolute path
|
||||
# Should only be called in recover.
|
||||
_dir = os.path.basename(output_dir)
|
||||
|
@ -59,15 +62,13 @@ class EvaluationStep:
|
|||
return None
|
||||
global_step = int(match.groups()[0])
|
||||
return EvaluationStep(
|
||||
global_step=global_step, status=EvaluationStepStatus.LOGGED
|
||||
args=args, global_step=global_step, status=EvaluationStepStatus.LOGGED
|
||||
)
|
||||
|
||||
@property
|
||||
def output_dir(self):
|
||||
return os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
constants.get_log_path(self.args),
|
||||
"eval_output",
|
||||
f"globalstep{self.global_step}",
|
||||
)
|
||||
|
@ -77,7 +78,7 @@ class EvaluationStep:
|
|||
cmd = (
|
||||
f"srun --mpi=pmi2 -J {slurm_job_name} --ntasks=1 --cpus-per-task=128 --gres=gpu:8 --mem-per-cpu=12G "
|
||||
f"singularity exec --no-home --nv --pid --writable-tmpfs --bind /storage:/storage "
|
||||
f"{config.eval_job_image or cluster.spec.gpu_image} "
|
||||
f"{config.eval_job_image or self.args.cluster.gpu_image} "
|
||||
f"bash ./evaluation/sh/install_deps_and_eval.sh {self.ckpt_dir} {self.output_dir} "
|
||||
f"{config.data_names} {config.max_gen_tokens} {config.prompt_type}"
|
||||
)
|
||||
|
@ -86,7 +87,7 @@ class EvaluationStep:
|
|||
def submit(self, config: config_pkg.AutomaticEvaluator):
|
||||
os.makedirs(self.output_dir, exist_ok=True)
|
||||
log_file = open(os.path.join(self.output_dir, "output.log"), "w")
|
||||
if cluster.spec.cluster_type == "slurm":
|
||||
if self.args.mode == "slurm":
|
||||
cmd = self.slurm_eval_cmd(config)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
|
@ -155,10 +156,12 @@ class AutomaticEvaluator:
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
args: BaseExperimentConfig,
|
||||
config: config_pkg.AutomaticEvaluator,
|
||||
wandb_config: config_pkg.WandBConfig,
|
||||
swanlab_config: config_pkg.SwanlabConfig,
|
||||
):
|
||||
self.args = args
|
||||
self.__eval_steps: Dict[int, EvaluationStep] = {}
|
||||
self.__max_concurrent_jobs = config.max_concurrent_jobs
|
||||
self.__wandb_config = wandb_config
|
||||
|
@ -174,15 +177,13 @@ class AutomaticEvaluator:
|
|||
# Resubmiting or waiting for these jobs will probably result in
|
||||
# unexpected behaviors.
|
||||
output_parent = os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
constants.get_log_path(args),
|
||||
"eval_output",
|
||||
)
|
||||
if os.path.exists(output_parent):
|
||||
for output_dir in os.listdir(output_parent):
|
||||
output_dir = os.path.join(output_parent, output_dir)
|
||||
eval_step = EvaluationStep.from_output_dir(output_dir)
|
||||
eval_step = EvaluationStep.from_output_dir(self.args, output_dir)
|
||||
if eval_step:
|
||||
self.__eval_steps[eval_step.global_step] = eval_step
|
||||
|
||||
|
@ -197,12 +198,13 @@ class AutomaticEvaluator:
|
|||
)
|
||||
if self.__config.initial_checkpoint_path and 0 not in self.__eval_steps:
|
||||
self.__eval_steps[0] = EvaluationStep(
|
||||
args=self.args,
|
||||
global_step=0,
|
||||
status=EvaluationStepStatus.PENDING,
|
||||
ckpt_dir=self.__config.initial_checkpoint_path,
|
||||
)
|
||||
|
||||
if not cluster.spec.cluster_type == "slurm":
|
||||
if not self.args.mode == "slurm":
|
||||
raise NotImplementedError(
|
||||
"Currently only support automatic evaluation for slurm"
|
||||
)
|
||||
|
@ -224,9 +226,7 @@ class AutomaticEvaluator:
|
|||
notes=self.__wandb_config.notes,
|
||||
tags=self.__wandb_config.tags,
|
||||
config=self.__wandb_config.config,
|
||||
dir=os.path.join(
|
||||
constants.LOG_ROOT, constants.experiment_name(), constants.trial_name()
|
||||
),
|
||||
dir=constants.get_log_path(self.args),
|
||||
force=True,
|
||||
id=f"{constants.experiment_name()}_{constants.trial_name()}_eval",
|
||||
resume="allow",
|
||||
|
@ -270,15 +270,13 @@ class AutomaticEvaluator:
|
|||
def step(self):
|
||||
# Check whether a new evaluation step should be created
|
||||
ckpt_parent = os.path.join(
|
||||
constants.MODEL_SAVE_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
constants.get_save_path(self.args),
|
||||
"actor",
|
||||
)
|
||||
if os.path.exists(ckpt_parent):
|
||||
for ckpt_dir in os.listdir(ckpt_parent):
|
||||
ckpt_dir = os.path.join(ckpt_parent, ckpt_dir)
|
||||
eval_step = EvaluationStep.from_ckpt_dir(ckpt_dir)
|
||||
eval_step = EvaluationStep.from_ckpt_dir(self.args, ckpt_dir)
|
||||
if eval_step is None:
|
||||
continue
|
||||
if eval_step.global_step in self.__eval_steps:
|
||||
|
|
|
@ -14,7 +14,7 @@ import psutil
|
|||
|
||||
import realhf.base.logging as logging
|
||||
from realhf.base import gpu_utils
|
||||
from realhf.base.constants import LOG_ROOT
|
||||
from realhf.base.constants import get_log_path
|
||||
from realhf.scheduler.client import (
|
||||
JobException,
|
||||
JobInfo,
|
||||
|
@ -75,14 +75,12 @@ class LocalSchedulerClient(SchedulerClient):
|
|||
|
||||
def log_path_of(self, worker_type) -> str:
|
||||
return os.path.join(
|
||||
LOG_ROOT,
|
||||
self.expr_name,
|
||||
self.trial_name,
|
||||
get_log_path(self.args),
|
||||
f"{worker_type}-0",
|
||||
)
|
||||
|
||||
def __init__(self, expr_name, trial_name):
|
||||
super().__init__(expr_name, trial_name)
|
||||
def __init__(self, args):
|
||||
super().__init__(args)
|
||||
self._jobs: Dict[str, subprocess.Popen] = {}
|
||||
self._running_worker_types = []
|
||||
|
||||
|
|
|
@ -15,9 +15,7 @@ from typing import Dict, List, Literal, Optional, Tuple
|
|||
import colorama
|
||||
|
||||
import realhf.base.logging as logging
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.base.constants import LOG_ROOT
|
||||
from realhf.base.constants import SLURM_LOCK_FILE_NAME as LOCK_FILE_NAME
|
||||
from realhf.base.constants import get_log_path
|
||||
from realhf.scheduler.client import JobException, JobInfo, JobState, SchedulerClient
|
||||
from realhf.scheduler.evaluator import AutomaticEvaluator
|
||||
from realhf.scheduler.slurm.utils import (
|
||||
|
@ -82,14 +80,13 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
expr_name: str,
|
||||
trial_name: str,
|
||||
args,
|
||||
schedule_strategy: str,
|
||||
evaluator: Optional[AutomaticEvaluator],
|
||||
job_group_id: str,
|
||||
job_group_index: int,
|
||||
):
|
||||
super().__init__(expr_name, trial_name)
|
||||
super().__init__(args)
|
||||
|
||||
self.__schedule_strategy = schedule_strategy
|
||||
|
||||
|
@ -124,11 +121,31 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
deadline: str = None,
|
||||
time_limit: str = None,
|
||||
):
|
||||
container_image = container_image or cluster_spec.cpu_image
|
||||
container_mounts = container_mounts or cluster_spec.mount
|
||||
container_image = container_image or self.args.cluster.cpu_image
|
||||
container_mounts = container_mounts or self.args.cluster.mount
|
||||
# record launch information, do not submit to slurm until `wait()` is called
|
||||
# NOTE: fractional GPU requirement will be resolved automatically in `__post_init__` of SlurnLaunchInfo
|
||||
log_path = os.path.join(
|
||||
get_log_path(self.args),
|
||||
f"{worker_type}-{self.__submission_counter[worker_type]}.log",
|
||||
)
|
||||
multiprog_path = os.path.join(
|
||||
get_log_path(self.args),
|
||||
"slurm",
|
||||
"multiprog",
|
||||
f"{worker_type}-{self.__submission_counter[worker_type]}.multiprog",
|
||||
)
|
||||
os.makedirs(os.path.dirname(multiprog_path), exist_ok=True)
|
||||
hostfile_path = os.path.join(
|
||||
get_log_path(self.args),
|
||||
"slurm",
|
||||
"hostfile",
|
||||
f"{worker_type}-{self.__submission_counter[worker_type]}.hostfile",
|
||||
)
|
||||
os.makedirs(os.path.dirname(hostfile_path), exist_ok=True)
|
||||
|
||||
launch_info = SlurmLaunchInfo(
|
||||
args=self.args,
|
||||
worker_type=worker_type,
|
||||
wprocs_in_job=count,
|
||||
resource_requirement=SlurmResource(mem=mem, cpu=cpu, gpu=gpu),
|
||||
|
@ -149,6 +166,9 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
time_limit=time_limit,
|
||||
job_group_id=self.__job_group_id,
|
||||
job_group_index=self.__job_group_index,
|
||||
log_path=log_path,
|
||||
multiprog_path=multiprog_path,
|
||||
hostfile_path=hostfile_path,
|
||||
)
|
||||
|
||||
if (
|
||||
|
@ -177,9 +197,9 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
wproc_offset=self.__wprocs_counter[worker_type],
|
||||
)
|
||||
wrap_cmd = "singularity exec "
|
||||
if cluster_spec.name == "na132":
|
||||
if self.args.cluster.cluster_name == "na132":
|
||||
wrap_cmd += "--pid "
|
||||
if cluster_spec.gpu_type == "tesla":
|
||||
if self.args.cluster.gpu_type == "tesla":
|
||||
wrap_cmd += "--nv "
|
||||
wrap_cmd += "--no-home --writable-tmpfs "
|
||||
if len(launch_info.env_vars) > 0:
|
||||
|
@ -199,7 +219,9 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
start_time = time.monotonic()
|
||||
while True:
|
||||
try:
|
||||
fp = open(LOCK_FILE_NAME, "w")
|
||||
fp = open(
|
||||
f"{self.args.cluster.fileroot}/logs/slurm_scheduler.lock", "w"
|
||||
)
|
||||
fcntl.flock(fp, fcntl.LOCK_EX)
|
||||
infos = list(self.__pending_jobs.values())
|
||||
infos = allocate_resources(infos, strategy=self.__schedule_strategy)
|
||||
|
@ -297,9 +319,7 @@ class SlurmSchedulerClient(SchedulerClient):
|
|||
threads = []
|
||||
stop_events = []
|
||||
|
||||
merged_log_path = os.path.join(
|
||||
LOG_ROOT, self.expr_name, self.trial_name, "main.log"
|
||||
)
|
||||
merged_log_path = os.path.join(get_log_path(self.args), "main.log")
|
||||
|
||||
for job_name, launch_info in self.__committed_jobs.items():
|
||||
stop_event = threading.Event()
|
||||
|
|
|
@ -20,10 +20,10 @@ from typing import Callable, Dict, List, Literal, Optional, Union
|
|||
|
||||
import pandas as pd
|
||||
|
||||
import realhf.base.cluster as cluster
|
||||
import realhf.base.logging as logging
|
||||
import realhf.version as version
|
||||
from realhf.base.constants import LOG_ROOT
|
||||
from realhf.api.cli_args import BaseExperimentConfig
|
||||
from realhf.base.constants import get_log_path
|
||||
from realhf.scheduler.client import JobException, JobInfo, JobState
|
||||
|
||||
logger = logging.getLogger("scheduler.slurm.utils")
|
||||
|
@ -190,6 +190,7 @@ class SlurmLaunchInfo:
|
|||
multiprog_content (str, optional): The content of the multiprog file.
|
||||
"""
|
||||
|
||||
args: BaseExperimentConfig
|
||||
run_name: str
|
||||
exper_name: str
|
||||
trial_name: str
|
||||
|
@ -199,6 +200,10 @@ class SlurmLaunchInfo:
|
|||
job_group_id: str
|
||||
job_group_index: str
|
||||
|
||||
log_path: str
|
||||
multiprog_path: str
|
||||
hostfile_path: str
|
||||
|
||||
resource_requirement: SlurmResource
|
||||
cmd: str
|
||||
container_image: str
|
||||
|
@ -252,41 +257,6 @@ class SlurmLaunchInfo:
|
|||
else:
|
||||
return None
|
||||
|
||||
@property
|
||||
def log_path(self) -> str:
|
||||
return os.path.join(
|
||||
LOG_ROOT,
|
||||
self.exper_name,
|
||||
self.trial_name,
|
||||
f"{self.worker_type}-{self.worker_submission_idx}.log",
|
||||
)
|
||||
|
||||
@property
|
||||
def multiprog_path(self) -> str:
|
||||
path = os.path.join(
|
||||
LOG_ROOT,
|
||||
self.exper_name,
|
||||
self.trial_name,
|
||||
"slurm",
|
||||
"multiprog",
|
||||
f"{self.worker_type}-{self.worker_submission_idx}.multiprog",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
return path
|
||||
|
||||
@property
|
||||
def hostfile_path(self) -> str:
|
||||
path = os.path.join(
|
||||
LOG_ROOT,
|
||||
self.exper_name,
|
||||
self.trial_name,
|
||||
"slurm",
|
||||
"hostfile",
|
||||
f"{self.worker_type}-{self.worker_submission_idx}.hostfile",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
return path
|
||||
|
||||
def show_log(self):
|
||||
try:
|
||||
terminal_columns = os.get_terminal_size().columns
|
||||
|
@ -364,14 +334,14 @@ class SlurmLaunchInfo:
|
|||
# head
|
||||
gres_line = ""
|
||||
if gpu >= 1:
|
||||
assert (gpu * ntasks) % cluster.spec.n_gpus_per_node == 0
|
||||
assert (gpu * ntasks) % self.args.cluster.n_gpus_per_node == 0
|
||||
# In current slurm cluster setup, we can only use "--gres" to
|
||||
# allocate PPUs per node. There are no options to allocate customized
|
||||
# gres per tasks.
|
||||
if cluster.spec.gpu_type == "ppu":
|
||||
gres_line = f"--gres=ppu:{cluster.spec.n_gpus_per_node}"
|
||||
if self.args.cluster.gpu_type == "ppu":
|
||||
gres_line = f"--gres=ppu:{self.args.cluster.n_gpus_per_node}"
|
||||
else:
|
||||
gres_line = f"--gres=gpu:{cluster.spec.n_gpus_per_node}"
|
||||
gres_line = f"--gres=gpu:{self.args.cluster.n_gpus_per_node}"
|
||||
|
||||
srun_env = os.environ.copy()
|
||||
job_metadata = {
|
||||
|
@ -391,7 +361,7 @@ class SlurmLaunchInfo:
|
|||
f"#SBATCH --output={self.log_path}",
|
||||
"#SBATCH --open-mode=append",
|
||||
f"#SBATCH --ntasks={ntasks}",
|
||||
f"#SBATCH {gres_line}",
|
||||
f"#SBATCH {gres_line}" if gpu >= 1 else "",
|
||||
f"#SBATCH --cpus-per-task={cpu}",
|
||||
f"#SBATCH --mem-per-cpu={mem // max(1, cpu)}M",
|
||||
"#SBATCH --distribution=arbitrary" if self.hostfile else "",
|
||||
|
@ -722,6 +692,9 @@ def allocate_resources(
|
|||
infos, key=lambda x: x.n_jobsteps * x.resource_requirement, reverse=True
|
||||
)
|
||||
prioritized_hosts = set()
|
||||
if len(infos) == 0:
|
||||
return infos
|
||||
cluster_config = infos[0].args.cluster
|
||||
for info_idx, info in enumerate(infos):
|
||||
valid_hostnames = available_hostnames(
|
||||
nodelist=info.nodelist,
|
||||
|
@ -764,12 +737,12 @@ def allocate_resources(
|
|||
gpu_per_task = info.resource_requirement.gpu
|
||||
if gpu_per_task > 0:
|
||||
assert (
|
||||
task_left * gpu_per_task % cluster.spec.n_gpus_per_node == 0
|
||||
task_left * gpu_per_task % cluster_config.n_gpus_per_node == 0
|
||||
), (task_left, gpu_per_task)
|
||||
assert (
|
||||
cluster.spec.n_gpus_per_node % gpu_per_task == 0
|
||||
cluster_config.n_gpus_per_node % gpu_per_task == 0
|
||||
), gpu_per_task
|
||||
batched_ntasks = int(cluster.spec.n_gpus_per_node // gpu_per_task)
|
||||
batched_ntasks = int(cluster_config.n_gpus_per_node // gpu_per_task)
|
||||
batched_requirement = batched_ntasks * info.resource_requirement
|
||||
try:
|
||||
resource = resource - batched_requirement
|
||||
|
@ -787,10 +760,10 @@ def allocate_resources(
|
|||
allocated[hostname] = tmp - task_left
|
||||
all_resources[hostname] = resource
|
||||
if task_left > 0:
|
||||
if cluster.spec.gpu_type == "ppu" and info.resource_requirement.gpu > 0:
|
||||
if cluster_config.gpu_type == "ppu" and info.resource_requirement.gpu > 0:
|
||||
logger.warning(
|
||||
"For PPU resources, we can only allocate tasks in the "
|
||||
f"granularity of nodes ({cluster.spec.n_gpus_per_node} PPUs)"
|
||||
f"granularity of nodes ({cluster_config.n_gpus_per_node} PPUs)"
|
||||
)
|
||||
logger.warning(
|
||||
f'Unable to allocate {info.n_jobsteps} Jobs with name "{info.slurm_name}". '
|
||||
|
|
|
@ -61,7 +61,7 @@ def run_worker(
    )
    worker = worker_class(server=server)
    try:
        if worker_type in ["rollout_worker", "master_worker", "gserver_manager"]:
        if worker_type in ["rollout_worker", "master_worker"]:
            asyncio.run(worker.run_async())
        else:
            worker.run()
@ -27,7 +27,6 @@ from omegaconf import OmegaConf
|
|||
|
||||
import realhf.api.core.system_api as system_api
|
||||
from realhf.base import constants, gpu_utils, logging, name_resolve, names, pkg_version
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.system import WORKER_TYPES, load_worker, worker_base, worker_control
|
||||
from realhf.system.worker_base import WorkerServerStatus as Wss
|
||||
|
||||
|
@ -247,9 +246,7 @@ class Controller:
|
|||
|
||||
# If a log exists, find the last failed setup and run it.
|
||||
start_idx = 0
|
||||
prev_logfile = os.path.join(
|
||||
constants.LOG_ROOT, self.experiment_name, self.trial_name, "ctl-0"
|
||||
)
|
||||
prev_logfile = os.path.join(constants.get_log_path(experiment), "ctl-0")
|
||||
if os.path.exists(prev_logfile):
|
||||
with open(prev_logfile, "r") as f:
|
||||
for l in f.readlines():
|
||||
|
@ -670,6 +667,7 @@ class RayController:
|
|||
]
|
||||
|
||||
env_vars = constants.get_env_vars(
|
||||
experiment,
|
||||
REAL_MODE=os.environ.get("REAL_MODE", ""),
|
||||
REAL_RECOVER_RUN=os.environ.get("REAL_RECOVER_RUN", ""),
|
||||
REAL_SAVE_RECOVER_STATES=os.environ.get("REAL_SAVE_RECOVER_STATES", ""),
|
||||
|
|
|
@ -24,6 +24,7 @@ blogger = logging.getLogger("benchmark")
|
|||
class FunctionExecutor:
|
||||
def __init__(
|
||||
self,
|
||||
args,
|
||||
rpcs: List[MFCDef],
|
||||
msid2mwid: Dict[ModelShardID, int],
|
||||
stream: NameResolvingRequestClient,
|
||||
|
@ -35,6 +36,8 @@ class FunctionExecutor:
|
|||
shuffle_dataset: bool,
|
||||
):
|
||||
|
||||
self.args = args
|
||||
|
||||
self.func_calls: Dict[str, ModelFunctionCall] = {}
|
||||
self.ctrl = ctrl
|
||||
|
||||
|
@ -42,7 +45,9 @@ class FunctionExecutor:
|
|||
self.msid2mwid = msid2mwid
|
||||
|
||||
self.storage_tracker = GlobalStorageTracker(self.n_model_workers)
|
||||
self.redistrib_planner = RedistribPlanner(self.storage_tracker)
|
||||
self.redistrib_planner = RedistribPlanner(
|
||||
self.args.cluster, self.storage_tracker
|
||||
)
|
||||
|
||||
self.rpcs = rpcs
|
||||
self.src_rpc = list(filter(lambda rpc: rpc.is_src, rpcs))[0]
|
||||
|
@ -51,6 +56,7 @@ class FunctionExecutor:
|
|||
# Create model function calls.
|
||||
for rpc in self.rpcs:
|
||||
func_call = ModelFunctionCall(
|
||||
args=self.args,
|
||||
rpc=rpc,
|
||||
src_rpc=self.src_rpc,
|
||||
stream=stream,
|
||||
|
|
|
@ -20,7 +20,6 @@ from realhf.base import (
|
|||
pkg_version,
|
||||
seeding,
|
||||
)
|
||||
from realhf.base.cluster import spec as cluster_spec
|
||||
from realhf.system.worker_base import PollResult, Worker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -139,7 +138,7 @@ class GenerationServer(Worker):
|
|||
map(str, range(gpu_utils.gpu_count()))
|
||||
)
|
||||
else:
|
||||
servers_per_node = cluster_spec.n_gpus_per_node // self.config.tp_size
|
||||
servers_per_node = self.args.cluster.n_gpus_per_node // self.config.tp_size
|
||||
idx_on_this_node = self.worker_index % servers_per_node
|
||||
self.base_gpu_id = idx_on_this_node * self.config.tp_size
|
||||
|
||||
|
@ -159,7 +158,7 @@ class GenerationServer(Worker):
|
|||
# NOTE: Ports returned by `find_multiple_free_ports` are unique,
|
||||
# but SGLang servers still encounter conflicts.
|
||||
# Use a clearance period to hack over this issue.
|
||||
servers_per_node = cluster_spec.n_gpus_per_node // self.config.tp_size
|
||||
servers_per_node = self.args.cluster.n_gpus_per_node // self.config.tp_size
|
||||
idx_on_this_node = self.worker_index % servers_per_node
|
||||
time.sleep(idx_on_this_node * PORT_CLEARANCE_PERIOD / servers_per_node)
|
||||
|
||||
|
@ -169,6 +168,7 @@ class GenerationServer(Worker):
|
|||
high=60000,
|
||||
experiment_name=self.experiment_name,
|
||||
trial_name=self.trial_name,
|
||||
lockfile_root=os.path.join(constants.get_cache_path(self.args), "ports"),
|
||||
)
|
||||
server_port = ports[0]
|
||||
nccl_port = ports[1]
|
||||
|
|
|
@@ -29,7 +29,7 @@ class AllocateRolloutInput:
qid: str


class GserverManager(AsyncWorker):
class GserverManager(Worker):
"""This worker has the following functionalities:
1. As a router, it schedules generation requests and returns the
best server urls to clients for submitting generation requests.

@@ -71,7 +71,7 @@ class GserverManager(AsyncWorker):
self.server_urls = []

# recover info
self.__recover_run, self.__recover_info = recover.load_recover_info()
self.__recover_run, self.__recover_info = recover.load_recover_info(self.args)
if self.__recover_run:
# update weights will be automatically triggered upon the first schedule_request
# self._last_param_realloc_step will also be updated

@@ -110,11 +110,7 @@ class GserverManager(AsyncWorker):
epoch = self.__recover_info.last_step_info.epoch + 1
epochstep = self.__recover_info.last_step_info.epoch_step + 1
globalstep = self.__recover_info.last_step_info.global_step + 1
save_root = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
)
save_root = constants.get_save_path(self.args)
role_path = os.path.join(save_root, role)
if not os.path.exists(role_path):
raise RuntimeError(

@@ -150,9 +146,7 @@ class GserverManager(AsyncWorker):
self._loaded_recover_weights = True
else:
realloc_dir = os.path.join(
constants.PARAM_REALLOC_PATH,
constants.experiment_name(),
constants.trial_name(),
constants.get_param_realloc_path(self.args),
self.model_name.role,
str(realloc_version),
)

@@ -213,7 +207,7 @@ class GserverManager(AsyncWorker):
url = min(self.server_urls, key=lambda k: self._server_token_usage[k])
return self.server_urls.index(url)

async def _poll_async(self):
def _poll(self):
if not self.thread:
# Find addresses of generation servers
self.server_urls = self._discover_servers(self.config.n_servers)

@@ -292,9 +286,7 @@ class GserverManager(AsyncWorker):

# clear old weights
realloc_root = os.path.join(
constants.PARAM_REALLOC_PATH,
constants.experiment_name(),
constants.trial_name(),
constants.get_param_realloc_path(self.args),
self.model_name.role,
)
if os.path.exists(realloc_root):

@@ -483,6 +475,7 @@ class GserverManager(AsyncWorker):
port = network.find_free_port(
experiment_name=self.experiment_name,
trial_name=self.trial_name,
lockfile_root=os.path.join(constants.get_cache_path(self.args), "ports"),
)
self.manager_addr = f"{network.gethostip()}:{port}"

@@ -97,15 +97,8 @@ class MasterWorker(worker_base.AsyncWorker):
freq_sec=config.exp_ctrl.eval_freq_secs,
)

self.MODEL_SAVE_ROOT = os.path.join(
constants.MODEL_SAVE_ROOT,
config.worker_info.experiment_name,
config.worker_info.trial_name,
)
os.makedirs(self.MODEL_SAVE_ROOT, exist_ok=True)

self.__initialized = False
self.__recover_run, self.__recover_info = recover.load_recover_info()
self.__recover_run, self.__recover_info = recover.load_recover_info(self.args)
if self.__recover_info is not None:
logger.info(
f"Loaded recover info: recover_start={self.__recover_info.recover_start}, "

@@ -305,9 +298,7 @@ class MasterWorker(worker_base.AsyncWorker):
notes=self.wandb_config.notes,
tags=self.wandb_config.tags,
config=self.wandb_config.config,
dir=os.path.join(
constants.LOG_ROOT, constants.experiment_name(), constants.trial_name()
),
dir=constants.get_log_path(self.args),
force=True,
id=f"{constants.experiment_name()}_{constants.trial_name()}_train",
resume="allow",

@@ -355,6 +346,7 @@ class MasterWorker(worker_base.AsyncWorker):
# Create coroutines for model RPCs.
logger.debug(f"Creating asyncio coroutines...")
self.func_executor = FunctionExecutor(
args=self.args,
rpcs=self.__model_rpcs,
msid2mwid=self.config.msid2mwid,
stream=self.__stream,

@@ -599,7 +591,7 @@ class MasterWorker(worker_base.AsyncWorker):
hash_vals_to_ignore=self.__rpc_ctrl.used_hash_vals_this_epoch,
)

recover.dump_recover_info(recover_info)
recover.dump_recover_info(self.args, recover_info)
logger.info("Dumped recover info to file.")
logger.info(f"Will recover from: {recover_info.recover_start}")
logger.info(

@@ -56,6 +56,7 @@ class RPCCorountineControl:
class ModelFunctionCall:
def __init__(
self,
args,
rpc: dfg.MFCDef,
src_rpc: dfg.MFCDef,
stream: request_reply_stream.NameResolvingRequestClient,

@@ -68,6 +69,8 @@ class ModelFunctionCall:
summary_writer: SummaryWriter | None,
):

self.args = args

self.rpc = rpc
self.src_rpc = src_rpc
self.stream = stream

@@ -82,12 +85,6 @@ class ModelFunctionCall:
for msid, mwid in msid2mwid.items():
self.mwid2msids[mwid].append(msid)

self.model_save_root = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
)

self.rpc_ctrl = ctrl
self.buffers = buffers
self.redistrib_planner = redistrib_planner

@@ -221,7 +218,7 @@ class ModelFunctionCall:
for p in payloads.values():
p.post_hooks.append("save")
save_dir = os.path.join(
self.model_save_root,
constants.get_log_path(self.args),
rpc.model_name.role,
f"epoch{ctrl.step_info.epoch + 1}"
f"epochstep{ctrl.step_info.epoch_step + 1}"

@@ -43,12 +43,6 @@ from realhf.base import (
timeutil,
topology,
)
from realhf.base.monitor import (
CUDATimeMarkType,
cuda_tmark,
cuda_tmarked,
dump_tmark_db,
)
from realhf.impl.model.nn.real_llm_api import ReaLModel
from realhf.impl.model.utils import cuda_graph
from realhf.system import request_reply_stream, worker_base

@@ -136,7 +130,7 @@ class ModelWorker(worker_base.Worker):
r = self.config.worker_info

# recover info
self.__recover_run, self.__recover_info = recover.load_recover_info()
self.__recover_run, self.__recover_info = recover.load_recover_info(self.args)

# Whether to enable profiling is controlled by the following environment variables.
self.__enable_profiler = os.getenv("REAL_DUMP_TRACE", "0") == "1"

@@ -174,11 +168,7 @@ class ModelWorker(worker_base.Worker):
epoch = self.__recover_info.last_step_info.epoch + 1
epochstep = self.__recover_info.last_step_info.epoch_step + 1
globalstep = self.__recover_info.last_step_info.global_step + 1
save_root = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
)
save_root = constants.get_save_path(self.args)
if epoch > 0:
role_path = os.path.join(save_root, role)
if os.path.exists(role_path):

@@ -251,6 +241,7 @@ class ModelWorker(worker_base.Worker):
)

self.__pg_info = global_comm.setup_global_comm(
args=self.args,
expr_name=self.__experiment_name,
trial_name=self.__trial_name,
worker_index=self.__worker_index,

@@ -312,13 +303,6 @@ class ModelWorker(worker_base.Worker):
self.__dataset_dp_rank,
self.__dataset_dp_size,
self.config.tokenizer_name_or_path,
self.config.worker_info.experiment_name,
self.config.worker_info.trial_name,
cache_root=(
None
if not self.config.use_dataset_cache
else self.config.dataset_cahce_root
),
)
for d in self.config.datasets
]

@@ -388,9 +372,7 @@ class ModelWorker(worker_base.Worker):
and hasattr(d, "filter")
):
dataset_indices_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_save_path(self.args),
"dataset_indices",
f"{self._dp_rank}_{i}.npy",
)

@@ -438,13 +420,6 @@ class ModelWorker(worker_base.Worker):
s.id.dp_rank,
s.id.topo.get_dim("data"),
self.__models[s.id.model_name].tokenizer,
self.config.worker_info.experiment_name,
self.config.worker_info.trial_name,
cache_root=(
None
if not self.config.use_dataset_cache
else self.config.dataset_cahce_root
),
)
eval_dataloader = torch.utils.data.DataLoader(
eval_dataset,

@@ -497,16 +472,15 @@ class ModelWorker(worker_base.Worker):
self.__param_realloc(hook_data)
elif hook == "offload":
# NOTE: Profiling (or cuda synchronization) will cause an overhead ~0.5s.
with cuda_tmarked("offload", CUDATimeMarkType.mem_layout):
m = self.__unwrapped_models[hook_data["model_name"]]
if not isinstance(m, ReaLModel):
logger.warning(
f"Model {hook_data['model_name']} (type={type(m)}) is not a ReaLModel, "
f"so it can't use offload."
)
return
if not m._offloaded:
m.async_offload()
m = self.__unwrapped_models[hook_data["model_name"]]
if not isinstance(m, ReaLModel):
logger.warning(
f"Model {hook_data['model_name']} (type={type(m)}) is not a ReaLModel, "
f"so it can't use offload."
)
return
if not m._offloaded:
m.async_offload()
elif hook == "save":
self.__save_model(hook_data)
elif hook == "evaluate":

@@ -602,15 +576,11 @@ class ModelWorker(worker_base.Worker):
except StopIteration:
# Upon the first fetch request, filter dataset and create dataloader.
eval_scores_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_save_path(self.args),
"dataset_eval_scores.json",
)
dataset_indices_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_save_path(self.args),
"dataset_indices",
f"{dp_rank}_{dataset_id}.npy",
)

@@ -668,7 +638,7 @@ class ModelWorker(worker_base.Worker):
continue
if self.data_manager.has_data(x.ids[0]):
continue
data_loaded.append(x)
data_loaded.append(x.cpu())
self.data_manager.store(x)
assert len(set([x.ids[0] for x in data_loaded])) == len(data_loaded)

@@ -700,25 +670,23 @@ class ModelWorker(worker_base.Worker):
"dataset_size": self.dataset_size,
}
elif request.handle_name == "clear_data_cache":
with cuda_tmarked("clear_data_cache", CUDATimeMarkType.misc):
ids = request.data
self.data_manager.remove(ids)
gc.collect()
if (
self.config.cuda_cache_cleanliness
and self.__clear_cache_frequency.check()
):
st = time.monotonic()
self._clear_memory(force=True)
et = time.monotonic()
blogger.debug(
f"Model worker {self.__worker_index} cleared cache in {et-st:.4f}s. "
)
ids = request.data
self.data_manager.remove(ids)
gc.collect()
if (
self.config.cuda_cache_cleanliness
and self.__clear_cache_frequency.check()
):
st = time.monotonic()
self._clear_memory(force=True)
et = time.monotonic()
blogger.debug(
f"Model worker {self.__worker_index} cleared cache in {et-st:.4f}s. "
)
logger.debug(
"Get clear_data_cache, dump cuda tmark. "
"Get clear_data_cache. "
f"Remaining data in local storage: {self.data_manager.storage_size()}. "
)
dump_tmark_db(self.__worker_index)
res = request_reply_stream.NoResponse()
self.__reply_queue.put_nowait((request, res))
self.__request_sample_size[request.request_id] = 1

@@ -820,9 +788,7 @@ class ModelWorker(worker_base.Worker):
tik = time.perf_counter()
global_step = self.__models[model_name].version.global_step
realloc_dir = os.path.join(
constants.PARAM_REALLOC_PATH,
constants.experiment_name(),
constants.trial_name(),
constants.get_param_realloc_path(self.args),
model_name.role,
str(global_step),
)

@@ -852,9 +818,7 @@ class ModelWorker(worker_base.Worker):

def _get_setup_logdir(self, name):
subdir = os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_log_path(self.args),
name,
f"setup{self._setup_counter}",
)

@@ -990,9 +954,7 @@ class ModelWorker(worker_base.Worker):
raise NotImplementedError(f"Unknown MFC type: {request.handle_name}.")

eval_scores_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_save_path(self.args),
"dataset_eval_scores.json",
)
eval_scores = {}

@@ -1061,7 +1023,6 @@ class ModelWorker(worker_base.Worker):
dist.barrier(group=constants.cpu_parallelism_group())
return res

@cuda_tmark("data_transfer", CUDATimeMarkType.comm)
def __data_transfer_among_workers(self, hook_data: Dict[str, Any]):
meta_sample = hook_data["meta_sample"]

@@ -1130,9 +1091,7 @@ class ModelWorker(worker_base.Worker):
global_step = int(global_step.item())

realloc_dir = os.path.join(
constants.PARAM_REALLOC_PATH,
constants.experiment_name(),
constants.trial_name(),
constants.get_param_realloc_path(self.args),
from_model_name.role,
str(global_step),
)

@@ -1284,7 +1243,6 @@ class ModelWorker(worker_base.Worker):
f"Time consumption: {float(t):.4f}s."
)

@cuda_tmark("post_response", CUDATimeMarkType.misc)
def maybe_post_responses(self):
ready_to_post = []
while True:

@@ -1335,7 +1293,6 @@ class ModelWorker(worker_base.Worker):
time.sleep(_MODEL_WORKER_POLL_REQUESTS_INTERVAL_SECS)
pass

@cuda_tmark("receive_request", CUDATimeMarkType.misc)
def maybe_receive_requests(self):
tik = time.perf_counter()
while time.perf_counter() - tik < _MODEL_WORKER_POLL_REQUESTS_SECS:

@@ -1,4 +1,5 @@
import logging
import os
from queue import Empty as QueueEmpty
from typing import Any, Dict, List, Optional, Union

@@ -6,7 +7,7 @@ import orjson
import zmq
from zmq.utils.strtypes import asbytes

from realhf.base import logging, name_resolve, names, network
from realhf.base import constants, logging, name_resolve, names, network

logger = logging.getLogger("ZMQ Push-Pull Stream")

@@ -160,12 +161,16 @@ class NameResolvingZmqPusher(ZMQJsonPusher):


class NameResolvingZmqPuller(ZMQJsonPuller):
def __init__(self, experiment_name, trial_name, puller_index, **kwargs):
def __init__(self, args, puller_index, **kwargs):
experiment_name = args.experiment_name
trial_name = args.trial_name
name = names.push_pull_stream(
experiment_name, trial_name, stream_name=f"puller{puller_index}"
)
host, port = network.gethostip(), network.find_free_port(
experiment_name=experiment_name, trial_name=trial_name
experiment_name=experiment_name,
trial_name=trial_name,
lockfile_root=os.path.join(constants.get_cache_path(args), "ports"),
)
addr = f"{host}:{port}"
name_resolve.add(name, addr)

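A brief usage sketch of the new puller signature above, which takes the whole experiment config instead of separate experiment/trial names; the import path is an assumption, and `args` is expected to expose `experiment_name`, `trial_name`, and `cluster` as elsewhere in this diff.

# Sketch: constructing a puller from the experiment config (import path assumed).
from realhf.system.push_pull_stream import NameResolvingZmqPuller


def make_puller(args, dp_rank: int):
    # The puller binds to a free port, registers its address via name_resolve,
    # and is later discovered by pushers through the same per-trial key.
    return NameResolvingZmqPuller(args, puller_index=dp_rank)
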
@@ -6,7 +6,7 @@ import itertools
from collections import defaultdict
from typing import *

from realhf.base.cluster import spec as cluster_spec
from realhf.api.cli_args import ClusterSpecConfig


class GlobalStorageTracker:

@@ -70,7 +70,10 @@ class RedistribStep:


class RedistribPlanner:
def __init__(self, storage_tracker: GlobalStorageTracker):
def __init__(
self, cluster_config: ClusterSpecConfig, storage_tracker: GlobalStorageTracker
):
self.cluster_config = cluster_config
self.storage_tracker = storage_tracker

def derive_plan(

@@ -269,8 +272,8 @@ class RedistribPlanner:
return self._group_bcast_transfers()

def _on_same_node(self, i, j) -> bool:
return (i // cluster_spec.n_gpus_per_node) == (
j // cluster_spec.n_gpus_per_node
return (i // self.cluster_config.n_gpus_per_node) == (
j // self.cluster_config.n_gpus_per_node
)

def _select_best_bcast_source(self, source_gpus, target_gpus):

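The planner now receives the cluster spec explicitly instead of reading a module-level global; a minimal construction sketch, mirroring the updated test later in this diff (import path of the planner module is assumed):

# Sketch: wiring RedistribPlanner with an explicit ClusterSpecConfig.
from realhf.api.cli_args import ClusterSpecConfig
# Import path assumed; these classes live in the module patched above.
from realhf.system.redistributor import GlobalStorageTracker, RedistribPlanner

cluster_cfg = ClusterSpecConfig(n_gpus_per_node=8)
tracker = GlobalStorageTracker(16)  # 16 = number of model workers in this example
planner = RedistribPlanner(cluster_cfg, tracker)
# _on_same_node(i, j) now compares i // cluster_cfg.n_gpus_per_node with
# j // cluster_cfg.n_gpus_per_node rather than consulting a global spec.
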
@@ -87,7 +87,7 @@ class RolloutWorker(AsyncWorker):
self.rollout_stat = RolloutStat()

# recover info
self.__recover_run, self.__recover_info = recover.load_recover_info()
self.__recover_run, self.__recover_info = recover.load_recover_info(self.args)

return config.worker_info

@@ -101,13 +101,6 @@ class RolloutWorker(AsyncWorker):
self.worker_index,
self.worker_count,
self.config.tokenizer_path,
self.config.worker_info.experiment_name,
self.config.worker_info.trial_name,
cache_root=(
None
if not self.config.use_dataset_cache
else self.config.dataset_cahce_root
),
)
for d in self.config.datasets
]

@@ -129,9 +122,7 @@ class RolloutWorker(AsyncWorker):
# Recover indices for dynamic dataset
if hasattr(self.dataset, "filter"):
dataset_indices_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_log_path(self.args),
f"dataset_indices_{self.worker_index}.npy",
)
if os.path.exists(dataset_indices_path):

@@ -156,15 +147,11 @@ class RolloutWorker(AsyncWorker):
self.is_new_epoch = True
# Upon the first fetch request, filter dataset and create dataloader.
eval_scores_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_log_path(self.args),
"dataset_eval_scores.json",
)
dataset_indices_path = os.path.join(
constants.MODEL_SAVE_ROOT,
constants.experiment_name(),
constants.trial_name(),
constants.get_log_path(self.args),
f"dataset_indices_{self.worker_index}.npy",
)
if hasattr(self.dataset, "filter") and os.path.exists(eval_scores_path):

@@ -2,6 +2,7 @@ import queue
import sys
import threading
import time
import traceback
from typing import Any, List, Optional

from torch.utils.data import ConcatDataset, Dataset

@@ -23,6 +24,7 @@ class PullerStreamDataset(Dataset):
def __init__(
self,
util: DatasetUtility,
args,
dataset_cfgs: List[DatasetAbstraction],
pull_timeout_ms=100,
):

@@ -35,8 +37,6 @@ class PullerStreamDataset(Dataset):
dp_rank=util.dp_rank,
world_size=util.world_size,
tokenizer_or_tokenizer_name=util.tokenizer,
experiment_name=constants.experiment_name(),
trial_name=constants.trial_name(),
)
for dataset_cfg in dataset_cfgs
]

@@ -51,6 +51,8 @@ class PullerStreamDataset(Dataset):
self.data_queue = queue.Queue(maxsize=self.dataset_size * util.world_size)
self._stop_event = threading.Event()

self.args = args

# Pass ZMQ context (thread-safe) and let worker create the socket
self.util = util
self.worker_thread = threading.Thread(target=self._pull_data_worker)

@@ -60,33 +62,33 @@ class PullerStreamDataset(Dataset):
"""Worker thread that creates its own ZMQ puller and streams data."""
# Initialize the puller inside the worker thread
stream = NameResolvingZmqPuller(
constants.experiment_name(),
constants.trial_name(),
self.args,
puller_index=self.util.dp_rank,
)
try:
while not self._stop_event.is_set():
processed_data = None
while not self._stop_event.is_set():
if processed_data is not None:
try:
data = stream.pull(timeout_ms=self.pull_timeout_ms)
processed_data = [
SequenceSample.from_json_compatible(x) for x in data
]
logger.debug(
f"Get data {[x.ids[0] for x in processed_data]} from puller stream."
)
self.data_queue.put(processed_data)
except queue.Empty:
logger.debug(f"No data from puller stream.")
self.data_queue.put_nowait(processed_data)
processed_data = None
except queue.Full:
time.sleep(0.1)
continue
finally:
# Ensure socket is closed in the same thread
del stream
# Exit if this thread has an error
sys.exit(1)
try:
data = stream.pull(timeout_ms=self.pull_timeout_ms)
processed_data = [SequenceSample.from_json_compatible(x) for x in data]
logger.debug(
f"Get data {[x.ids[0] for x in processed_data]} from puller stream."
)
except queue.Empty:
logger.debug(f"No data from puller stream.")
time.sleep(0.1)
continue

def __getitem__(self, idx: int) -> Optional[Any]:
samples = []
if not self.worker_thread.is_alive():
raise RuntimeError("Stream dataset puller thread is not alive.")
while True:
try:
samples += self.data_queue.get_nowait()

@@ -99,8 +101,6 @@ class PullerStreamDataset(Dataset):

def __del__(self):
self._stop_event.set()
if self.worker_thread.is_alive():
self.worker_thread.join(timeout=1.0)


register_dataset("puller_stream", PullerStreamDataset)

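The rewritten worker loop above pulls a batch, then retries a non-blocking put on the bounded queue, backing off briefly while the queue is full so a slow consumer does not drop data; a standalone sketch of that producer pattern:

# Standalone sketch of the pull/put_nowait retry pattern used above.
import queue
import threading
import time


def pull_loop(pull_fn, out_q: queue.Queue, stop: threading.Event, idle_sleep: float = 0.1):
    pending = None
    while not stop.is_set():
        if pending is not None:
            try:
                out_q.put_nowait(pending)
                pending = None
            except queue.Full:
                time.sleep(idle_sleep)
            continue
        try:
            pending = pull_fn()  # expected to raise queue.Empty on timeout
        except queue.Empty:
            time.sleep(idle_sleep)
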
@@ -568,6 +568,7 @@ class Worker:
self.__worker_index = worker_info.worker_index

experiment = system_api.make_experiment(name=worker_info.experiment_name)
self.args = experiment

expr_config = experiment.initial_setup()
if isinstance(expr_config, list):

@@ -1,6 +1,7 @@
import asyncio
import json
import os
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import numpy as np

@@ -21,7 +22,7 @@ def mock_env():


@pytest.fixture
def agent_config():
def agent_config(tmp_path):
return {
"gconfig": MagicMock(n=2),
"tokenizer_path": "/storage/openpsi/models/Qwen__Qwen2.5-0.5B-Instruct/",

@@ -29,6 +30,7 @@ def agent_config():
"success_rate_ub": 1.0,
"reward_scaling": 2.0,
"reward_bias": 0.1,
"answer_save_path": tmp_path,
}

@@ -140,48 +142,34 @@ async def test_collect_trajectory_empty_act_queue(agent, mock_env, mock_prompt):

def test_log_rewards_to_file(agent, tmp_path):
# Setup test directories
with (
patch("realhf.base.constants.LOG_ROOT", tmp_path),
patch("realhf.base.constants.experiment_name", return_value="test_exp"),
patch("realhf.base.constants.trial_name", return_value="test_trial"),
):
agent.log_rewards_to_file(
qid="123",
prompt="test_prompt",
prompt_len=3,
answers=["answer1", "answer2"],
seqlens=[5, 6],
rewards=[0.5, 0.7],
success=[True, False],
version_starts=[1, 2],
version_ends=[2, 3],
)
agent.log_rewards_to_file(
qid="123",
prompt="test_prompt",
prompt_len=3,
answers=["answer1", "answer2"],
seqlens=[5, 6],
rewards=[0.5, 0.7],
success=[True, False],
version_starts=[1, 2],
version_ends=[2, 3],
)

# Check generated file
gen_file_path = (
tmp_path / "test_exp" / "test_trial" / "generated" / "1" / "123.txt"
)
assert gen_file_path.exists()
with open(gen_file_path) as f:
content = f.read()
assert "idx: 1 / 2" in content
assert "seqlen: 5" in content
assert "test_prompt" in content
# Check generated file
gen_file_path = Path(agent.answer_save_path) / "1" / "123.txt"
assert gen_file_path.exists()
with open(gen_file_path) as f:
content = f.read()
assert "idx: 1 / 2" in content
assert "seqlen: 5" in content
assert "test_prompt" in content

# Check monitor file
monitor_file_path = (
tmp_path
/ "test_exp"
/ "test_trial"
/ "training_monitor"
/ "1"
/ "123.jsonl"
)
assert monitor_file_path.exists()
with open(monitor_file_path) as f:
data = json.loads(f.readline())
assert data["version_start"] == 1
assert data["prompt_len"] == 3
# Check monitor file
monitor_file_path = Path(agent.answer_save_path) / "1" / "123.jsonl"
assert monitor_file_path.exists()
with open(monitor_file_path) as f:
data = json.loads(f.readline())
assert data["version_start"] == 1
assert data["prompt_len"] == 3


def test_reward_calculation(agent):

@@ -14,6 +14,7 @@ import pytest
import torch
import torch.distributed as dist

from realhf.api.cli_args import ClusterSpecConfig
from realhf.api.core.config import ModelName, ModelShardID
from realhf.api.core.data_api import SequenceSample
from realhf.base import constants, testing, topology

@@ -166,7 +167,7 @@ def _test_data_transfer(
data_manager.setup_process_groups()

storage_tracker = GlobalStorageTracker(dist.get_world_size())
planner = RedistribPlanner(storage_tracker)
planner = RedistribPlanner(ClusterSpecConfig(), storage_tracker)

key = "input_ids"

@@ -4,12 +4,12 @@

import random
import time
import uuid

import pytest
import torch

import realhf.base.constants as constants
import realhf.base.testing as testing
from realhf.base import constants, name_resolve, testing


# This is a test for grouped_gemm experts implementation of MoE.

@@ -99,10 +99,7 @@ def run_grouped_mlp(num_tokens, tp_size, token_dispatch_strategy, seed=1):
)


@pytest.mark.skipif(
not torch.cuda.is_available(),
reason="This test requires GPU to run",
)
@pytest.mark.skip("grouped_gemm is not used for now.")
@pytest.mark.parametrize("num_tokens", [200])
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("token_dispatch_strategy", ["random"])

@@ -123,10 +120,7 @@ def test_grouped_mlp(
test.launch()


@pytest.mark.skipif(
not torch.cuda.is_available(),
reason="This test requires GPU to run",
)
@pytest.mark.skip("grouped_gemm is not used for now.")
@pytest.mark.gpu
def test_grouped_gemm():
torch.manual_seed(1)

@@ -36,7 +36,7 @@ def maybe_synchronize_cuda():
torch.cuda.synchronize()


@pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.")
@pytest.mark.skip("interval_ops are not used now.")
@pytest.mark.parametrize(
"n_intervals", list(reversed([1, 100, 500, 1000, 2000, 4000, 10000, 100000]))
)

@@ -86,7 +86,7 @@ def test_get(n_intervals: int, dtype: torch.dtype):
)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="This test requires a GPU.")
@pytest.mark.skip("interval_ops are not used now.")
@pytest.mark.parametrize(
"n_intervals", list(reversed([1, 10, 100, 500, 1000, 1000, 10000, 100000]))
)

@@ -21,8 +21,6 @@ def _validate_dataset(cfg: config_api.DatasetAbstraction, tokenizer):
dp_rank=0,
world_size=1,
tokenizer_or_tokenizer_name=tokenizer,
experiment_name=str(uuid.uuid4()),
trial_name=str(uuid.uuid4()),
)
dataloader = DataLoader(
dataset,

@@ -21,13 +21,13 @@ BACKENDS = [
("nfs", {}),
("ray", {}),
]
if os.environ.get("REAL_ETCD_ADDR"):
if os.environ.get("TESTING_ETCD_ADDR"):
BACKENDS.append(
(
"etcd3",
{
"host": os.getenv("REAL_ETCD_ADDR").split(":")[0],
"port": int(os.getenv("REAL_ETCD_ADDR").split(":")[1]),
"host": os.getenv("TESTING_ETCD_ADDR").split(":")[0],
"port": int(os.getenv("TESTING_ETCD_ADDR").split(":")[1]),
},
)
)

@@ -43,12 +43,9 @@ def name_resolve(request):
temp_dir = tempfile.mkdtemp()
from realhf.base.name_resolve import NfsNameRecordRepository

original_root = NfsNameRecordRepository.RECORD_ROOT
NfsNameRecordRepository.RECORD_ROOT = temp_dir
repo = NfsNameRecordRepository()
repo = NfsNameRecordRepository(temp_dir)
yield repo
repo.reset()
NfsNameRecordRepository.RECORD_ROOT = original_root
shutil.rmtree(temp_dir)
elif backend_type == "memory":
from realhf.base.name_resolve import MemoryNameRecordRepository

@@ -208,17 +205,6 @@ def test_reset(name_resolve):
name_resolve.delete("test_key_no_delete")


def test_context_manager(name_resolve):
"""Test context manager functionality."""
with name_resolve.__class__() as repo:
repo.add("test_key", "test_value", delete_on_exit=True)
assert repo.get("test_key") == "test_value"

# Key should be deleted after context exits
with pytest.raises(NameEntryNotFoundError):
name_resolve.get("test_key")


def test_concurrent_access(name_resolve):
"""Test concurrent access to the same key."""
name_resolve.add("test_key", "initial_value")

@@ -648,7 +634,9 @@ def test_corner_case_get_same_as_prefix(name_resolve):
assert set(keys) == {"prefix", "prefix/child"}


@pytest.mark.skipif(os.getenv("REAL_ETCD_ADDR") is None, reason="ETCD3 not configured")
@pytest.mark.skipif(
os.getenv("TESTING_ETCD_ADDR") is None, reason="ETCD3 not configured"
)
def test_etcd3_specific_features(name_resolve):
if not isinstance(name_resolve, Etcd3NameRecordRepository):
pytest.skip("ETCD3 specific test")

@@ -663,7 +651,9 @@ def test_etcd3_specific_features(name_resolve):
name_resolve.get("test_key")


@pytest.mark.skipif(os.getenv("REAL_ETCD_ADDR") is not None, reason="NFS specific test")
@pytest.mark.skipif(
os.getenv("TESTING_ETCD_ADDR") is not None, reason="NFS specific test"
)
def test_nfs_specific_features(name_resolve):
"""Test features specific to NFS backend."""
from realhf.base.name_resolve import NfsNameRecordRepository

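Per the fixture change above, the NFS repository now takes its record root as a constructor argument rather than patching a class attribute; a small usage sketch (method names taken from the tests in this diff):

# Sketch: NFS-backed name resolving rooted at an explicit directory.
import tempfile

from realhf.base.name_resolve import NfsNameRecordRepository

root = tempfile.mkdtemp()
repo = NfsNameRecordRepository(root)
repo.add("test_key", "test_value", delete_on_exit=True)
assert repo.get("test_key") == "test_value"
repo.reset()
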
@@ -8,6 +8,7 @@ from typing import *
import pytest

from realhf.api.cli_args import (
ClusterSpecConfig,
ExperimentSaveEvalControl,
MFCConfig,
ModelTrainEvalConfig,

@@ -15,7 +16,7 @@ from realhf.api.cli_args import (
PromptAnswerDatasetConfig,
PromptOnlyDatasetConfig,
)
from realhf.base import cluster, logging, name_resolve, testing
from realhf.base import logging, name_resolve, testing
from realhf.experiments.common.null_exp import NullPPOConfig, NullSFTConfig
from tests.experiments.utils import run_test_exp
from tests.fixtures import *

@@ -64,11 +65,8 @@ def test_buffer_recover(
):
_, dataset_size = math_code_dataset_with_size
# Setup experiment env. Should be done before any other operations.
log_root = tmp_path_factory.mktemp("buffer-recover")
cluster.spec.fileroot = str(log_root)
expr_name = str(uuid.uuid4())
trial_name = str(uuid.uuid4())
testing.clear_name_resolve(expr_name, trial_name)
constants.set_experiment_trial_names(expr_name, trial_name)

exp_cfg = NullPPOConfig(

@@ -114,6 +112,10 @@ def test_buffer_recover(
save_freq_steps=2,
benchmark_steps=0,
),
cluster=ClusterSpecConfig(
fileroot=str(tmp_path_factory.mktemp("buffer-recover")),
n_gpus_per_node=16,
),
)

os.environ["REAL_SAVE_RECOVER_STATES"] = "1"

|
@ -8,6 +8,7 @@ from typing import *
|
|||
import pytest
|
||||
|
||||
from realhf.api.cli_args import (
|
||||
ClusterSpecConfig,
|
||||
ExperimentSaveEvalControl,
|
||||
GenerationHyperparameters,
|
||||
MFCConfig,
|
||||
|
@ -17,7 +18,7 @@ from realhf.api.cli_args import (
|
|||
PPOHyperparameters,
|
||||
PromptOnlyDatasetConfig,
|
||||
)
|
||||
from realhf.base import cluster, testing
|
||||
from realhf.base import testing
|
||||
from realhf.experiments.common.ppo_math_exp import PPOMATHConfig
|
||||
from tests.experiments.utils import run_test_exp
|
||||
from tests.fixtures import *
|
||||
|
@ -73,8 +74,6 @@ def test_ppo_symm(
|
|||
mp,
|
||||
):
|
||||
# Setup experiment env. Should be done before any other operations.
|
||||
log_root = tmp_path_factory.mktemp("ppo")
|
||||
cluster.spec.fileroot = str(log_root)
|
||||
constants.set_experiment_trial_names(
|
||||
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
|
||||
)
|
||||
|
@ -117,6 +116,7 @@ def test_ppo_symm(
|
|||
),
|
||||
),
|
||||
group_size=2,
|
||||
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("ppo"))),
|
||||
)
|
||||
|
||||
run_test_exp(exp_cfg)
|
||||
|
@ -152,8 +152,6 @@ def test_ppo_decoupled(
|
|||
gmp,
|
||||
):
|
||||
# Setup experiment env. Should be done before any other operations.
|
||||
log_root = tmp_path_factory.mktemp("ppo")
|
||||
cluster.spec.fileroot = str(log_root)
|
||||
constants.set_experiment_trial_names(
|
||||
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
|
||||
)
|
||||
|
@ -245,6 +243,7 @@ def test_ppo_decoupled(
|
|||
),
|
||||
),
|
||||
group_size=2,
|
||||
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("ppo"))),
|
||||
)
|
||||
|
||||
run_test_exp(exp_cfg)
|
||||
|
@ -275,8 +274,6 @@ def test_ppo_global_reshard(
|
|||
rew_inf,
|
||||
):
|
||||
# Setup experiment env. Should be done before any other operations.
|
||||
log_root = tmp_path_factory.mktemp("ppo-global-reshard")
|
||||
cluster.spec.fileroot = str(log_root)
|
||||
constants.set_experiment_trial_names(
|
||||
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
|
||||
)
|
||||
|
@ -368,6 +365,7 @@ def test_ppo_global_reshard(
|
|||
pipeline_parallel_size=critic_train[2],
|
||||
),
|
||||
),
|
||||
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("ppo"))),
|
||||
)
|
||||
run_test_exp(exp_cfg)
|
||||
|
||||
|
@ -388,8 +386,6 @@ def test_ppo_param_realloc_sub_device_mesh(
|
|||
critic_inf,
|
||||
):
|
||||
# Setup experiment env. Should be done before any other operations.
|
||||
log_root = tmp_path_factory.mktemp("ppo-submesh")
|
||||
cluster.spec.fileroot = str(log_root)
|
||||
constants.set_experiment_trial_names(
|
||||
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
|
||||
)
|
||||
|
@ -484,6 +480,7 @@ def test_ppo_param_realloc_sub_device_mesh(
|
|||
pipeline_parallel_size=2,
|
||||
),
|
||||
),
|
||||
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("ppo"))),
|
||||
)
|
||||
|
||||
run_test_exp(exp_cfg)
|
||||
|
@ -503,13 +500,9 @@ def test_ppo_save(
|
|||
bs,
|
||||
):
|
||||
# Setup experiment env. Should be done before any other operations.
|
||||
log_root = tmp_path_factory.mktemp("ppo")
|
||||
cluster.spec.fileroot = str(log_root)
|
||||
constants.set_experiment_trial_names(
|
||||
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
|
||||
)
|
||||
shutil.rmtree(constants.MODEL_SAVE_ROOT, ignore_errors=True)
|
||||
os.makedirs(constants.MODEL_SAVE_ROOT, exist_ok=True)
|
||||
|
||||
total_train_epochs = 3
|
||||
|
||||
|
@ -604,7 +597,11 @@ def test_ppo_save(
|
|||
pipeline_parallel_size=1,
|
||||
),
|
||||
),
|
||||
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("ppo"))),
|
||||
)
|
||||
shutil.rmtree(constants.get_save_path(exp_cfg), ignore_errors=True)
|
||||
os.makedirs(constants.get_save_path(exp_cfg), exist_ok=True)
|
||||
|
||||
exp_cfg.actor.vllm.hybrid_train = True
|
||||
exp_cfg.actor.vllm.enforce_eager = True
|
||||
|
||||
|
@ -636,9 +633,7 @@ def test_ppo_save(
|
|||
int(os.path.basename(f).split("globalstep")[-1])
|
||||
for f in os.listdir(
|
||||
os.path.join(
|
||||
constants.MODEL_SAVE_ROOT,
|
||||
testing._DEFAULT_EXPR_NAME,
|
||||
testing._DEFAULT_EXPR_NAME,
|
||||
constants.get_save_path(exp_cfg),
|
||||
model_name,
|
||||
)
|
||||
)
|
||||
|
|
|
@@ -6,11 +6,12 @@ from typing import *
import pytest

from realhf.api.cli_args import (
ClusterSpecConfig,
ExperimentSaveEvalControl,
ModelTrainEvalConfig,
PromptAnswerDatasetConfig,
)
from realhf.base import cluster, testing
from realhf.base import testing
from realhf.experiments.common.sft_exp import SFTConfig
from tests.experiments.utils import run_test_exp
from tests.fixtures import *

@@ -32,9 +33,6 @@ def model_class(request):
(1, 2, 4),
(2, 4, 1),
(2, 1, 4),
(4, 2, 2),
(2, 4, 2),
(2, 2, 4),
],
)
def test_sft_xl(tmp_path_factory, tokenizer, save_path, cpu_hf_model, dp, pp, tp):

@@ -61,8 +59,6 @@ def test_sft(tmp_path_factory, tokenizer, save_path, cpu_hf_model, dp, pp, tp
def test_sft(tmp_path_factory, tokenizer, save_path, cpu_hf_model, dp, pp, tp):

# Setup experiment env. Should be done before any other operations.
log_root = tmp_path_factory.mktemp("sft")
cluster.spec.fileroot = str(log_root)
constants.set_experiment_trial_names(
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
)

@@ -89,6 +85,7 @@ def test_sft(tmp_path_factory, tokenizer, save_path, cpu_hf_model, dp, pp, tp):
valid_bs_n_seqs=minbs,
fill_to_max_length=False,
),
cluster=ClusterSpecConfig(fileroot=str(tmp_path_factory.mktemp("sft"))),
)

run_test_exp(exp_cfg)

@@ -1,4 +1,5 @@
# Copyright 2025 Ant Group Inc.
import asyncio
import functools
import multiprocessing as mp
from typing import *

@@ -6,7 +7,7 @@ from typing import *
import pytest

from realhf.api.core.system_api import Experiment, register_experiment
from realhf.base import cluster, constants, logging, testing
from realhf.base import constants, logging, testing
from realhf.system.worker_base import WorkerServerStatus
from tests.fixtures import *

@@ -74,7 +75,6 @@ def run_test_exp(
logger.info("Configuring master worker...")
mas.configure(setup_id=0, worker_info=exp_setup.master_worker[0].worker_info)
logger.info("Configuring master worker... Done.")
initd = False

# Run model workers in subprocesses
barrier = mp.Barrier(len(exp_setup.model_worker))

@@ -98,13 +98,17 @@ def run_test_exp(
testcase.start()

# Run the master worker.
for _ in range(int(1e4)):
if mas.status == WorkerServerStatus.PAUSED:
break
if not initd:
logger.info("Running master worker lazy initialization...")
mas._poll()
if not initd:
logger.info("Running master worker lazy initialization... Done.")
initd = True
async def run_master_worker():
initd = False
for _ in range(int(1e4)):
if mas.status == WorkerServerStatus.PAUSED:
break
if not initd:
logger.info("Running master worker lazy initialization...")
await mas._poll_async()
if not initd:
logger.info("Running master worker lazy initialization... Done.")
initd = True

asyncio.run(run_master_worker())
testcase.wait(timeout=0.1)

@@ -16,7 +16,7 @@ from realhf.api.core.data_api import (
load_hf_tokenizer,
)
from realhf.api.core.model_api import FinetuneSpec, Model
from realhf.base import constants, network, testing
from realhf.base import constants, name_resolve, network, testing
from tests.fixtures import *

@@ -61,8 +61,12 @@ def math_code_dataset(request, save_path):
"tokenizer_path", ["/storage/openpsi/models/Qwen__Qwen2-1.5B-Instruct/"]
)
def test_multi_task_reward_interface(save_path, tokenizer_path, math_code_dataset):
from realhf.api.cli_args import NameResolveConfig
from realhf.impl.dataset.math_code_dataset import MATHCodePromptDataset

name_resolve.reconfigure(
NameResolveConfig("nfs", f"/tmp/areal/{str(uuid.uuid4())}/")
)
dist.init_process_group(
rank=0, world_size=1, init_method=f"tcp://localhost:{network.find_free_port()}"
)

@@ -13,7 +13,9 @@ from typing import Optional
import aiohttp
import pytest

from realhf.api.cli_args import BaseExperimentConfig, NameResolveConfig
from realhf.api.core.config import ModelName
from realhf.api.core.system_api import ExpStatus
from realhf.api.core.system_api import GserverManager as GserverManagerConfig
from realhf.api.core.system_api import WorkerInformation
from realhf.base import constants, name_resolve, names, network, testing

@@ -42,6 +44,9 @@ def mock_servers():
from fastapi import FastAPI
from fastapi.responses import ORJSONResponse, PlainTextResponse

name_resolve.reconfigure(
NameResolveConfig("nfs", "/tmp/areal/test-gserver-manager")
)
ports = network.find_multiple_free_ports(N_SERVERS)

# Create mock server responses

@@ -124,7 +129,6 @@ def mock_servers():
@pytest.fixture
def gserver_manager(request, mock_servers):
train_batch_size, offpolicyness = request.param
testing.clear_name_resolve()
constants.set_experiment_trial_names(
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
)

@@ -135,6 +139,9 @@ def gserver_manager(request, mock_servers):
name_resolve.add_subentry(name, server_urls[0])
name_resolve.add_subentry(name, server_urls[1])

name = names.experiment_status(constants.experiment_name(), constants.trial_name())
name_resolve.add(name, ExpStatus.RUNNING)

# Mock requests.get for metrics endpoint
m = GserverManager()
config = GserverManagerConfig(

@@ -153,6 +160,7 @@ def gserver_manager(request, mock_servers):
worker_index=0,
),
)
m.args = BaseExperimentConfig()
m._configure(config)
# launch the server
m._poll()

@@ -6,6 +6,7 @@ from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from transformers import PreTrainedTokenizerFast

from realhf.api.cli_args import NameResolveConfig
from realhf.api.core.model_api import (
APIGenerateInput,
APIGenerateOutput,

@@ -67,6 +68,10 @@ def partial_rollout_manager():
request_queue = asyncio.Queue()
reply_queue = asyncio.Queue()

name_resolve.reconfigure(
NameResolveConfig("nfs", "/tmp/areal/test-partial-rollout")
)

testing.clear_name_resolve()
constants.set_experiment_trial_names(
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME

@@ -7,9 +7,10 @@ import pytest
import torch
from torch.utils.data import DataLoader

from realhf.api.cli_args import BaseExperimentConfig, NameResolveConfig
from realhf.api.core import config as config_api
from realhf.api.core import data_api
from realhf.base import constants, testing
from realhf.base import constants, name_resolve, testing
from tests.fixtures import *

@@ -64,6 +65,7 @@ def test_load_stream_dataset(prompt_dataset_cfg, tokenizer, mock_puller):
testing._DEFAULT_EXPR_NAME, testing._DEFAULT_TRIAL_NAME
)

name_resolve.reconfigure(NameResolveConfig("nfs", "/tmp/areal/test-stream-dataset"))
testing.clear_name_resolve()

util = data_api.DatasetUtility(

@@ -74,7 +76,12 @@ def test_load_stream_dataset(prompt_dataset_cfg, tokenizer, mock_puller):
)

# Test initialization
dataset = PullerStreamDataset(util, prompt_dataset_cfg, pull_timeout_ms=100)
dataset = PullerStreamDataset(
util,
args=BaseExperimentConfig(),
dataset_cfgs=prompt_dataset_cfg,
pull_timeout_ms=100,
)
assert len(dataset) > 0 # Should have non-zero size from prompt dataset
assert dataset.data_queue.empty()

@@ -91,12 +98,14 @@ def test_load_stream_dataset(prompt_dataset_cfg, tokenizer, mock_puller):
assert len(items1) + len(items2) > 0

# Test cleanup
dataset._stop_event.set()
del dataset


def test_puller_stream_dataset_timeout(prompt_dataset_cfg, tokenizer):
from realhf.system.stream_dataset import PullerStreamDataset

name_resolve.reconfigure(NameResolveConfig("nfs", "/tmp/areal/test-stream-dataset"))
testing.clear_name_resolve()

util = data_api.DatasetUtility(

@@ -109,15 +118,19 @@ def test_puller_stream_dataset_timeout(prompt_dataset_cfg, tokenizer):
with patch("realhf.system.stream_dataset.NameResolvingZmqPuller") as mock_puller:
mock_puller.return_value.pull.side_effect = queue.Empty()

dataset = PullerStreamDataset(util, prompt_dataset_cfg, pull_timeout_ms=10)
dataset = PullerStreamDataset(
util, BaseExperimentConfig(), prompt_dataset_cfg, pull_timeout_ms=10
)
# Should handle timeout gracefully
assert dataset[0] == []
dataset._stop_event.set()
del dataset


def test_puller_stream_dataset_stop_event(prompt_dataset_cfg, tokenizer, mock_puller):
from realhf.system.stream_dataset import PullerStreamDataset

name_resolve.reconfigure(NameResolveConfig("nfs", "/tmp/areal/test-stream-dataset"))
testing.clear_name_resolve()

util = data_api.DatasetUtility(

@@ -127,7 +140,7 @@ def test_puller_stream_dataset_stop_event(prompt_dataset_cfg, tokenizer, mock_pu
tokenizer=tokenizer,
)

dataset = PullerStreamDataset(util, prompt_dataset_cfg)
dataset = PullerStreamDataset(util, BaseExperimentConfig(), prompt_dataset_cfg)
assert not dataset._stop_event.is_set()

# Trigger stop event and verify thread stops

@@ -140,6 +153,7 @@ def test_puller_stream_dataset_stop_event(prompt_dataset_cfg, tokenizer, mock_pu
def test_puller_stream_dataset_worker_thread_exception(prompt_dataset_cfg, tokenizer):
from realhf.system.stream_dataset import PullerStreamDataset

name_resolve.reconfigure(NameResolveConfig("nfs", "/tmp/areal/test-stream-dataset"))
testing.clear_name_resolve()

util = data_api.DatasetUtility(

@@ -152,7 +166,7 @@ def test_puller_stream_dataset_worker_thread_exception(prompt_dataset_cfg, token
with patch("realhf.system.stream_dataset.NameResolvingZmqPuller") as mock_puller:
mock_puller.return_value.pull.side_effect = Exception("Test error")

dataset = PullerStreamDataset(util, prompt_dataset_cfg)
dataset = PullerStreamDataset(util, BaseExperimentConfig(), prompt_dataset_cfg)
time.sleep(0.1) # Give thread time to crash
assert not dataset.worker_thread.is_alive()
del dataset

@@ -8,7 +8,6 @@ import yaml
from omegaconf import MISSING, OmegaConf

from realhf.api.quickstart.entrypoint import kind_reminder
from realhf.base.constants import init_constants
from realhf.experiments.async_exp.async_ppo_math_exp import AsyncPPOMATHConfig
from training.utils import run_experiment

@@ -37,14 +36,10 @@ def main_ppo_math(args):
if args.mode != "ray":
raise RuntimeError("This script only supports the `ray` mode.")

init_constants(args)

from realhf.base.constants import LOG_ROOT
from realhf.base.constants import get_log_path

# Save overwritten configuration to yaml
config_save_path = os.path.join(
LOG_ROOT, args.experiment_name, args.trial_name, "config.yaml"
)
config_save_path = os.path.join(get_log_path(args), "config.yaml")
os.makedirs(os.path.dirname(config_save_path), exist_ok=True)
with open(config_save_path, "w") as f:
config_dict: Dict = dataclasses.asdict(args)

@@ -8,7 +8,6 @@ import yaml
from omegaconf import MISSING, OmegaConf

from realhf.api.quickstart.entrypoint import kind_reminder
from realhf.base.constants import init_constants
from realhf.experiments.common.sft_exp import SFTConfig
from training.utils import run_experiment

@@ -37,14 +36,10 @@ def main(args):
if args.mode != "ray":
raise RuntimeError("This script only supports the `ray` mode.")

init_constants(args)

from realhf.base.constants import LOG_ROOT
from realhf.base.constants import get_log_path

# Save overwritten configuration to yaml
config_save_path = os.path.join(
LOG_ROOT, args.experiment_name, args.trial_name, "config.yaml"
)
config_save_path = os.path.join(get_log_path(args), "config.yaml")
os.makedirs(os.path.dirname(config_save_path), exist_ok=True)
with open(config_save_path, "w") as f:
config_dict: Dict = dataclasses.asdict(args)

@@ -8,7 +8,6 @@ import yaml
from omegaconf import MISSING, OmegaConf

from realhf.api.quickstart.entrypoint import kind_reminder
from realhf.base.constants import init_constants
from realhf.experiments.common.ppo_math_exp import PPOMATHConfig
from training.utils import run_experiment

@@ -37,14 +36,10 @@ def main(args):
if args.mode != "ray":
raise RuntimeError("This script only supports the `ray` mode.")

init_constants(args)

from realhf.base.constants import LOG_ROOT
from realhf.base.constants import get_log_path

# Save overwritten configuration to yaml
config_save_path = os.path.join(
LOG_ROOT, args.experiment_name, args.trial_name, "config.yaml"
)
config_save_path = os.path.join(get_log_path(args), "config.yaml")
os.makedirs(os.path.dirname(config_save_path), exist_ok=True)
with open(config_save_path, "w") as f:
config_dict: Dict = dataclasses.asdict(args)

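The three entry scripts above now resolve the dump location through get_log_path instead of composing LOG_ROOT manually; condensed into one helper for clarity (a sketch, not a function that exists in the repo):

# Sketch: the config-dump step shared by the entry scripts above.
import dataclasses
import os

import yaml

from realhf.base.constants import get_log_path


def save_config_yaml(args) -> str:
    config_save_path = os.path.join(get_log_path(args), "config.yaml")
    os.makedirs(os.path.dirname(config_save_path), exist_ok=True)
    with open(config_save_path, "w") as f:
        yaml.dump(dataclasses.asdict(args), f)
    return config_save_path
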
@@ -12,6 +12,7 @@ from typing import Any, List
import psutil
import ray

from realhf.api.cli_args import NameResolveConfig
from realhf.api.core.system_api import Experiment, ExperimentScheduling, TasksGroup
from realhf.base import constants, logging, name_resolve, names
from realhf.system import WORKER_TYPES, load_worker

@@ -64,6 +65,7 @@ class RayWorker:

def __init__(
self,
args,
worker_type: str,
worker_cls,
kv_store_name,

@@ -74,15 +76,16 @@ class RayWorker:

os.environ["REAL_MODE"] = "RAY"

name_resolve.reconfigure("ray", actor_name=kv_store_name)
name_recolve_config = NameResolveConfig("ray", ray_actor_name=kv_store_name)
name_resolve.reconfigure(name_recolve_config)
self.worker: Worker | AsyncWorker = worker_cls()
self.worker_type = worker_type
self.args = args

def __repr__(self):
return "".join([c.capitalize() for c in self.worker_type.split("_")])

def configure(self, cfg: Any, expr_config: Any):
constants.init_constants(expr_config)

worker_info = cfg.worker_info
idx = worker_info.worker_index

@@ -92,6 +95,7 @@ class RayWorker:
self.worker.wandb_config = expr_config.wandb
self.worker.swanlab_config = expr_config.swanlab
self.worker.tensorboard_config = expr_config.tensorboard
self.worker.args = self.args
self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark")
self.logger.info(f"Configuring {self.worker_type}...")
self.worker._configure(cfg)

@@ -125,6 +129,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):

# Initialize ray in the Ray cluster
env_vars = constants.get_env_vars(
exp_cfg,
WADNB_MODE=exp_cfg.wandb.mode,
SWANLAB_MODE=exp_cfg.swanlab.mode,
REAL_MODE="ray",

@@ -145,7 +150,8 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
logger.info("Ray initialized! Ready to run workers.")

ray_kv_store_name = f"{expr_name}/{trial_name}/ray_kv_store"
name_resolve.reconfigure("ray", actor_name=ray_kv_store_name)
name_recolve_config = NameResolveConfig("ray", ray_actor_name=ray_kv_store_name)
name_resolve.reconfigure(name_recolve_config)

name_resolve.clear_subtree(
names.trial_root(experiment_name=expr_name, trial_name=trial_name)

@@ -210,6 +216,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
num_gpus=sch.scheduling.gpu,
memory=sch.scheduling.mem * 1024**2,
).remote(
args=exp_cfg,
worker_type=worker_type,
worker_cls=load_worker(worker_type),
kv_store_name=ray_kv_store_name,

@@ -239,7 +246,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
run_jobs = []
for worker_type in all_workers:
workers = all_workers[worker_type]
if worker_type in ["master_worker", "rollout_worker", "gserver_manager"]:
if worker_type in ["master_worker", "rollout_worker"]:
# Only the rollout worker is asynchronous
jobs = [w.run_async.remote() for w in workers]
else:

@@ -270,7 +277,7 @@ class DualOutput:


def run_experiment(exp_cfg, expr_name, trial_name):
log_path = os.path.join(constants.LOG_ROOT, expr_name, trial_name, "main.log")
log_path = os.path.join(constants.get_log_path(exp_cfg), "main.log")
with open(log_path, "a") as f:
# Create dual output handler
dual_out = DualOutput(f, sys.stdout)

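name_resolve.reconfigure now takes a NameResolveConfig value rather than positional backend arguments; a short sketch of selecting a backend (paths and names are examples only):

# Sketch: choosing the name-resolve backend through NameResolveConfig.
from realhf.api.cli_args import NameResolveConfig
from realhf.base import name_resolve

# NFS-backed KV store under a shared directory (example path).
name_resolve.reconfigure(NameResolveConfig("nfs", nfs_record_root="/tmp/areal/name_resolve"))

# Ray-actor-backed store, as RayWorker does above (example actor name).
# name_resolve.reconfigure(NameResolveConfig("ray", ray_actor_name="expr/trial/ray_kv_store"))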