From bb14f022dc0ae2b1e8501256447318e76c1be344 Mon Sep 17 00:00:00 2001 From: xichengpro Date: Mon, 16 Jun 2025 19:51:31 +0800 Subject: [PATCH] Support using SwanLab for experiment tracking (#98) * Support using SwanLab for experiment tracking * docs: improve WandB and SwanLab integration documentation - Added official links for better user reference - Used backticks to quote commands and parameters - Unified mode settings to use "online" / "cloud" convention - Merged WandB and SwanLab descriptions into a single concise statement - Added note on using `swanlab.mode="local"` when server connection is unavailable * refactor: update default value of api_key * fix: correct help description from WandB to SwanLab in SwanLabConfig * refactor: merge log_swanlab_tensorboard and log_wandb_tensorboard into log_swanlab_wandb_tensorboard - Unified logging logic for SwanLab, WandB, and TensorBoard to reduce code duplication * chore: update swanlab version in dependency config files - Updated SwanLab version in pyproject.toml - Updated SwanLab version in requirements.txt * refactor: enhance SwanLab config handling for logging purposes - Config now uses provided arguments first - Falls back to reading from config.yaml if no input is given * docs: add note on using when server connection is unavailable * refactor: merge _LATEST_WANDB_STEP and _LATEST_SWANLAB_STEP into _LATEST_LOG_STEP * Format code with black and isort * chore: update swanlab version in dependency config files - Updated SwanLab version in requirements.txt * refactor: rename swanlab_wandb_data to log_data --------- Co-authored-by: dubingnan --- docs/tutorial/quickstart.md | 5 +- evaluation/requirements.txt | 1 + pyproject.toml | 1 + realhf/api/cli_args.py | 16 +++++- realhf/api/core/system_api.py | 2 + realhf/apps/main.py | 2 +- realhf/base/logging.py | 26 +++++++--- realhf/experiments/async_exp/async_rl_exp.py | 1 + realhf/experiments/common/common.py | 1 + realhf/experiments/common/ppo_math_exp.py | 1 + 
realhf/scheduler/evaluator.py | 52 ++++++++++++++++++-- realhf/system/master_worker.py | 38 +++++++++++++- realhf/system/model_function_call.py | 8 +-- realhf/system/worker_base.py | 2 + requirements.txt | 3 +- training/configs/async-ppo.yaml | 7 +++ training/configs/sft.yaml | 7 +++ training/configs/sync-ppo.yaml | 7 +++ training/utils.py | 2 + 19 files changed, 160 insertions(+), 22 deletions(-) diff --git a/docs/tutorial/quickstart.md b/docs/tutorial/quickstart.md index b871cd7..899b377 100644 --- a/docs/tutorial/quickstart.md +++ b/docs/tutorial/quickstart.md @@ -97,12 +97,15 @@ python3 training/main_sync_ppo.py --help ## Monitoring the Training Process -We recommend using Weights & Biases (wandb) for monitoring. Run `wandb login` or set the `WANDB_API_KEY` environment variable. Set `wandb.mode=online` in your configuration to upload training statistics. ++ We recommend using [Weights & Biases (wandb)](https://github.com/wandb/wandb) or [SwanLab](https://github.com/SwanHubX/SwanLab) for monitoring. Run `wandb login` or `swanlab login`, or set the corresponding API key environment variable (`WANDB_API_KEY` or `SWANLAB_API_KEY`). Set `wandb.mode="online"` or `swanlab.mode="cloud"` in your configuration to upload training statistics. If you cannot connect to the server, you can also use `wandb.mode="offline"` or `swanlab.mode="local"` to save data locally without uploading. + You can also use TensorBoard by setting the `tensorboard.path` parameter. The main log will be saved to `${fileroot}/logs/${USER}/${experiment_name}/${trial_name}/main.log` and contains the statistics uploaded to wandb. +If SwanLab is enabled, logs will be saved to the directory specified by `swanlab.logdir`. + ### Key Training Statistics - **`Epoch 1/5`**: Indicates the total epochs required and the current epoch being trained. 
diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt index 3b211d4..7f8c6cb 100644 --- a/evaluation/requirements.txt +++ b/evaluation/requirements.txt @@ -15,3 +15,4 @@ prettytable timeout-decorator timeout_decorator wandb +swanlab[dashboard] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2433b17..7597e13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "colorlog", "psutil", "pynvml", + "swanlab[dashboard]", # Performance and compression "ninja", diff --git a/realhf/api/cli_args.py b/realhf/api/cli_args.py index 22ebd08..8c4cb28 100644 --- a/realhf/api/cli_args.py +++ b/realhf/api/cli_args.py @@ -848,6 +848,16 @@ class WandBConfig: config: Optional[Dict] = None +@dataclass +class SwanlabConfig: + project: Optional[str] = None + name: Optional[str] = None + config: Optional[Dict] = None + logdir: Optional[str] = None + mode: Optional[str] = "local" + api_key: Optional[str] = os.getenv("SWANLAB_API_KEY", None) + + @dataclass class TensorBoardConfig: path: Optional[str] = None @@ -951,6 +961,10 @@ class BaseExperimentConfig: default_factory=WandBConfig, metadata={"help": "Weights & Biases configuration."}, ) + swanlab: SwanlabConfig = field( + default_factory=SwanlabConfig, + metadata={"help": "SwanLab configuration."}, + ) tensorboard: TensorBoardConfig = field( default_factory=TensorBoardConfig, metadata={"help": "TensorBoard configuration. Only 'path' field required."}, @@ -1026,7 +1040,7 @@ class BaseExperimentConfig: default=False, metadata={ "help": "Enable automatic evaluation during training. " - "Results logged to disk and WandB (if active)." + "Results logged to disk and WandB or Swanlab(if active)." 
}, ) auto_eval_config: AutomaticEvaluator = field( diff --git a/realhf/api/core/system_api.py b/realhf/api/core/system_api.py index 05825d7..6dda819 100644 --- a/realhf/api/core/system_api.py +++ b/realhf/api/core/system_api.py @@ -11,6 +11,7 @@ import realhf.api.core.dfg as dfg from realhf.api.cli_args import ( AutomaticEvaluator, ExperimentSaveEvalControl, + SwanlabConfig, TensorBoardConfig, WandBConfig, ) @@ -254,6 +255,7 @@ class ExperimentScheduling: class ExperimentConfig: exp_ctrl: ExperimentSaveEvalControl wandb: WandBConfig + swanlab: SwanlabConfig tensorboard: TensorBoardConfig # dataflow model_rpcs: List[dfg.MFCDef] diff --git a/realhf/apps/main.py b/realhf/apps/main.py index d44d32c..146655e 100644 --- a/realhf/apps/main.py +++ b/realhf/apps/main.py @@ -94,7 +94,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0): raise RuntimeError("Experiment initial setup failed.") from e evaluator = ( - AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb) + AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab) if exp_cfg.auto_eval else None ) diff --git a/realhf/base/logging.py b/realhf/base/logging.py index 7b46ea9..4f21e47 100644 --- a/realhf/base/logging.py +++ b/realhf/base/logging.py @@ -141,19 +141,29 @@ def getLogger( return logging.getLogger(name) -_LATEST_WANDB_STEP = 0 +_LATEST_LOG_STEP = 0 -def log_wandb_tensorboard(data, step=None, summary_writer=None): +def log_swanlab_wandb_tensorboard(data, step=None, summary_writer=None): + # Logs data to SwanLab and wandb, and to TensorBoard when summary_writer is given. 
+ + global _LATEST_LOG_STEP + if step is None: + step = _LATEST_LOG_STEP + else: + _LATEST_LOG_STEP = max(_LATEST_LOG_STEP, step) + + # swanlab + import swanlab + + swanlab.log(data, step=step) + + # wandb import wandb - global _LATEST_WANDB_STEP - if step is None: - step = _LATEST_WANDB_STEP - else: - _LATEST_WANDB_STEP = max(_LATEST_WANDB_STEP, step) - wandb.log(data, step=step) + + # tensorboard if summary_writer is not None: for key, val in data.items(): summary_writer.add_scalar(f"{key}", val, step) diff --git a/realhf/experiments/async_exp/async_rl_exp.py b/realhf/experiments/async_exp/async_rl_exp.py index 5caf25b..b1a2252 100755 --- a/realhf/experiments/async_exp/async_rl_exp.py +++ b/realhf/experiments/async_exp/async_rl_exp.py @@ -331,6 +331,7 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions): return ExperimentConfig( exp_ctrl=self.exp_ctrl, wandb=self.wandb, + swanlab=self.swanlab, tensorboard=self.tensorboard, # NOTE: master and model worker only see RPCs without generation model_rpcs=[ diff --git a/realhf/experiments/common/common.py b/realhf/experiments/common/common.py index 4ff9a63..ef47549 100644 --- a/realhf/experiments/common/common.py +++ b/realhf/experiments/common/common.py @@ -564,6 +564,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment): return ExperimentConfig( exp_ctrl=self.exp_ctrl, wandb=self.wandb, + swanlab=self.swanlab, tensorboard=self.tensorboard, model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs], model_worker=model_worker, diff --git a/realhf/experiments/common/ppo_math_exp.py b/realhf/experiments/common/ppo_math_exp.py index e3ef786..9b8810e 100644 --- a/realhf/experiments/common/ppo_math_exp.py +++ b/realhf/experiments/common/ppo_math_exp.py @@ -370,6 +370,7 @@ class PPOMATHConfig(CommonExperimentConfig, PPOMATHExperimentOptions): return ExperimentConfig( exp_ctrl=self.exp_ctrl, wandb=self.wandb, + swanlab=self.swanlab, tensorboard=self.tensorboard, model_rpcs=[rpc_alloc.rpc for 
rpc_alloc in rpc_allocs], model_worker=model_worker, diff --git a/realhf/scheduler/evaluator.py b/realhf/scheduler/evaluator.py index a24029e..7c5fef0 100644 --- a/realhf/scheduler/evaluator.py +++ b/realhf/scheduler/evaluator.py @@ -8,6 +8,7 @@ import subprocess import time from typing import Dict, Optional +import swanlab import wandb import realhf.api.core.system_api as config_pkg @@ -125,13 +126,15 @@ class EvaluationStep: self.status = EvaluationStepStatus.FAILED return False - wandb_data = {} + log_data = {} for data_name, d in data.items(): for k, v in d.items(): - wandb_data[f"{data_name}_{k}"] = v - wandb.log(wandb_data, step=self.global_step) + log_data[f"{data_name}_{k}"] = v + wandb.log(log_data, step=self.global_step) + swanlab.log(log_data, step=self.global_step) self.status = EvaluationStepStatus.LOGGED - logger.info(f"Logging eval result {wandb_data} to step {self.global_step}") + logger.info(f"Logging eval result {log_data} to step {self.global_step}") + return True def check(self): @@ -154,13 +157,15 @@ class AutomaticEvaluator: self, config: config_pkg.AutomaticEvaluator, wandb_config: config_pkg.WandBConfig, + swanlab_config: config_pkg.SwanlabConfig, ): self.__eval_steps: Dict[int, EvaluationStep] = {} self.__max_concurrent_jobs = config.max_concurrent_jobs self.__wandb_config = wandb_config + self.__swanlab_config = swanlab_config self.__config = config self.__wandb_initialized = False - + self.__swanlab_initialized = False # Check evaluated checkpoints by logs in recover # NOTE: All previous evaluation steps with output will be marked # as logged, even if it is not really logged in wandb. 
@@ -228,6 +233,40 @@ class AutomaticEvaluator: settings=wandb.Settings(start_method="fork"), ) + def __lazy_swanlab_init(self): + if self.__swanlab_config.api_key: + swanlab.login(self.__swanlab_config.api_key) + if self.__swanlab_config.config is None: + import yaml + + with open( + os.path.join( + constants.LOG_ROOT, + constants.experiment_name(), + constants.trial_name(), + "config.yaml", + ), + "r", + ) as f: + __config = yaml.safe_load(f) + else: + __config = self.__swanlab_config.config + __config["FRAMEWORK"] = "AReaL" + swanlab.init( + project=self.__swanlab_config.project or constants.experiment_name(), + experiment_name=self.__swanlab_config.name + or f"{constants.trial_name()}_eval", + config=__config, + logdir=self.__swanlab_config.logdir + or os.path.join( + constants.LOG_ROOT, + constants.experiment_name(), + constants.trial_name(), + "swanlab", + ), + mode=self.__swanlab_config.mode, + ) + def step(self): # Check whether a new evaluation step should be created ckpt_parent = os.path.join( @@ -292,6 +331,9 @@ class AutomaticEvaluator: if not self.__wandb_initialized: self.__lazy_wandb_init() self.__wandb_initialized = True + if not self.__swanlab_initialized: + self.__lazy_swanlab_init() + self.__swanlab_initialized = True self.__eval_steps[log_step].log(self.__config) @property diff --git a/realhf/system/master_worker.py b/realhf/system/master_worker.py index 9a80c7c..ef4d1ed 100644 --- a/realhf/system/master_worker.py +++ b/realhf/system/master_worker.py @@ -12,6 +12,7 @@ from typing import Dict import colorama import networkx as nx import numpy as np +import swanlab import wandb from tensorboardX import SummaryWriter @@ -312,6 +313,40 @@ class MasterWorker(worker_base.AsyncWorker): resume="allow", settings=wandb.Settings(start_method="fork"), ) + + # swanlab init, connect to remote or local swanlab host + if self.swanlab_config.mode != "disabled" and self.swanlab_config.api_key: + swanlab.login(self.swanlab_config.api_key) + if 
self.swanlab_config.config is None: + import yaml + + with open( + os.path.join( + constants.LOG_ROOT, + constants.experiment_name(), + constants.trial_name(), + "config.yaml", + ), + "r", + ) as f: + __config = yaml.safe_load(f) + else: + __config = self.swanlab_config.config + __config["FRAMEWORK"] = "AReaL" + swanlab.init( + project=self.swanlab_config.project or constants.experiment_name(), + experiment_name=self.swanlab_config.name + or f"{constants.trial_name()}_train", + config=__config, + logdir=self.swanlab_config.logdir + or os.path.join( + constants.LOG_ROOT, + constants.experiment_name(), + constants.trial_name(), + "swanlab", + ), + mode=self.swanlab_config.mode, + ) # tensorboard logging self.__summary_writer = None if self.tensorboard_config.path is not None: @@ -487,7 +522,7 @@ class MasterWorker(worker_base.AsyncWorker): s += f"(global step {global_step}) finishes. " s += f"#End to end# execution time: *{e2e_time:.3f}*s. " s += f"Total time consumption: {time_since_configure:.3f}s. 
" - logging.log_wandb_tensorboard({"timeperf/e2e": e2e_time}) + logging.log_swanlab_wandb_tensorboard({"timeperf/e2e": e2e_time}) if len(self.e2e_time_history) > 2: remaining_steps = self._steps_per_epoch - epoch_step remaining_epochs = self.__total_train_epochs - epoch @@ -540,6 +575,7 @@ class MasterWorker(worker_base.AsyncWorker): ) wandb.finish() + swanlab.finish() if self.__summary_writer is not None: self.__summary_writer.close() gc.collect() diff --git a/realhf/system/model_function_call.py b/realhf/system/model_function_call.py index 659dc6e..8bda855 100644 --- a/realhf/system/model_function_call.py +++ b/realhf/system/model_function_call.py @@ -10,6 +10,7 @@ import uuid from collections import defaultdict from typing import Dict, Hashable, List, Set, Tuple +import swanlab import wandb from tensorboardX import SummaryWriter @@ -442,7 +443,7 @@ class ModelFunctionCall: logger.info( f"RPC name {rpc.name} returns\n{data_api.tabulate_stats(res)}" ) - logging.log_wandb_tensorboard( + logging.log_swanlab_wandb_tensorboard( res, step=ctrl.step_info.global_step, summary_writer=self.summary_writer, @@ -453,7 +454,7 @@ class ModelFunctionCall: f"RPC name {rpc.name} returns ({j + 1}/{len(res)})\n{data_api.tabulate_stats(r)}" ) offset = len(res) * ctrl.step_info.global_step - logging.log_wandb_tensorboard( + logging.log_swanlab_wandb_tensorboard( r, step=offset + j, summary_writer=self.summary_writer, @@ -465,11 +466,10 @@ class ModelFunctionCall: for time_record in time_records: stats_tracker.scalar(**time_record) time_stats = stats_tracker.export() - logging.log_wandb_tensorboard( + logging.log_swanlab_wandb_tensorboard( time_stats, summary_writer=self.summary_writer, ) - logger.info( f"Model rpc {rpc.name} finished. " f"Request-reply time {time.perf_counter() - tik:.4f}s. 
" diff --git a/realhf/system/worker_base.py b/realhf/system/worker_base.py index bbd8060..d673f50 100644 --- a/realhf/system/worker_base.py +++ b/realhf/system/worker_base.py @@ -580,7 +580,9 @@ class Worker: ) expr_config.lazy_init() self.wandb_config = expr_config.wandb + self.swanlab_config = expr_config.swanlab os.environ["WANDB_MODE"] = self.wandb_config.mode + os.environ["SWANLAB_MODE"] = self.swanlab_config.mode self.tensorboard_config = expr_config.tensorboard config = expr_config.resolve_worker_config( self.__worker_type, self.__worker_index diff --git a/requirements.txt b/requirements.txt index 5f775f6..4ffab7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,4 +68,5 @@ python_dateutil word2number Pebble timeout-decorator -prettytable \ No newline at end of file +prettytable +swanlab[dashboard] \ No newline at end of file diff --git a/training/configs/async-ppo.yaml b/training/configs/async-ppo.yaml index bb4cd6d..7c6e609 100644 --- a/training/configs/async-ppo.yaml +++ b/training/configs/async-ppo.yaml @@ -14,6 +14,13 @@ wandb: notes: null tags: null config: null +swanlab: + mode: disabled + api_key: null + project: null + name: null + config: null + logdir: null tensorboard: path: null recover_mode: auto diff --git a/training/configs/sft.yaml b/training/configs/sft.yaml index 822369b..109ce97 100644 --- a/training/configs/sft.yaml +++ b/training/configs/sft.yaml @@ -14,6 +14,13 @@ wandb: notes: null tags: null config: null +swanlab: + mode: disabled + api_key: null + project: null + name: null + config: null + logdir: null tensorboard: path: null recover_mode: auto diff --git a/training/configs/sync-ppo.yaml b/training/configs/sync-ppo.yaml index 88ae35f..cef7523 100644 --- a/training/configs/sync-ppo.yaml +++ b/training/configs/sync-ppo.yaml @@ -14,6 +14,13 @@ wandb: notes: null tags: null config: null +swanlab: + mode: disabled + api_key: null + project: null + name: null + config: null + logdir: null tensorboard: path: null recover_mode: 
auto diff --git a/training/utils.py b/training/utils.py index 515f675..a20ecbf 100644 --- a/training/utils.py +++ b/training/utils.py @@ -90,6 +90,7 @@ class RayWorker: worker_info.experiment_name, worker_info.trial_name ) self.worker.wandb_config = expr_config.wandb + self.worker.swanlab_config = expr_config.swanlab self.worker.tensorboard_config = expr_config.tensorboard self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark") self.logger.info(f"Configuring {self.worker_type}...") @@ -125,6 +126,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name): # Initialize ray in the Ray cluster env_vars = constants.get_env_vars( WADNB_MODE=exp_cfg.wandb.mode, + SWANLAB_MODE=exp_cfg.swanlab.mode, REAL_MODE="ray", REAL_RECOVER_RUN="0", REAL_SAVE_RECOVER_STATES="1",