Support using SwanLab for experiment tracking (#98)

* Support using SwanLab for experiment tracking

* docs: improve WandB and SwanLab integration documentation
- Added official links for better user reference
- Used backticks to quote commands and parameters
- Unified mode settings to use "online" / "cloud" convention
- Merged WandB and SwanLab descriptions into a single concise statement
- Added note on using `swanlab.mode="local"` when server connection is unavailable

* refactor: update default value of api_key

* fix: correct help description from WandB to SwanLab in SwanLabConfig

* refactor: merge log_swanlab_tensorboard and log_wandb_tensorboard into log_swanlab_wandb_tensorboard

 - Unified logging logic for SwanLab, WandB, and TensorBoard to reduce code duplication

* chore: update swanlab version in dependency config files

 - Updated SwanLab version in pyproject.toml
 - Updated SwanLab version in requirements.txt

* refactor: enhance SwanLab config handling for logging purposes
- Config now uses provided arguments first
- Falls back to reading from config.yaml if no input is given

* docs: add note on using `swanlab.mode="local"` when server connection is unavailable

* refactor: merge _LATEST_WANDB_STEP and _LATEST_SWANLAB_STEP into _LATEST_LOG_STEP

* Format code with black and isort

* chore: update swanlab version in dependency config files
- Updated SwanLab version in requirements.txt

* refactor: rename swanlab_wandb_data to log_data

---------

Co-authored-by: dubingnan <dubingnan@360.cn>
This commit is contained in:
xichengpro 2025-06-16 19:51:31 +08:00 committed by GitHub
parent f2f4b67bcd
commit bb14f022dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 160 additions and 22 deletions

View File

@ -97,12 +97,15 @@ python3 training/main_sync_ppo.py --help
## Monitoring the Training Process ## Monitoring the Training Process
We recommend using Weights & Biases (wandb) for monitoring. Run `wandb login` or set the `WANDB_API_KEY` environment variable. Set `wandb.mode=online` in your configuration to upload training statistics. + We recommend using [Weights & Biases (wandb)](https://github.com/wandb/wandb) or [SwanLab](https://github.com/SwanHubX/SwanLab) for monitoring—run `wandb login` or `swanlab login`, or set the corresponding environment variable API key (`WANDB_API_KEY` or `SWANLAB_API_KEY`). Set `wandb.mode="online"` or `swanlab.mode="cloud"` in your configuration to upload training statistics. If you cannot connect to the server, you can also use `wandb.mode="offline"` or `swanlab.mode="local"` to save data locally without uploading.
You can also use TensorBoard by setting the `tensorboard.path` parameter. You can also use TensorBoard by setting the `tensorboard.path` parameter.
The main log will be saved to `${fileroot}/logs/${USER}/${experiment_name}/${trial_name}/main.log` and contains the statistics uploaded to wandb. The main log will be saved to `${fileroot}/logs/${USER}/${experiment_name}/${trial_name}/main.log` and contains the statistics uploaded to wandb.
If SwanLab is enabled, logs will be saved to the directory specified by `swanlab.logdir`.
### Key Training Statistics ### Key Training Statistics
- **`Epoch 1/5`**: Indicates the total epochs required and the current epoch being trained. - **`Epoch 1/5`**: Indicates the total epochs required and the current epoch being trained.

View File

@ -15,3 +15,4 @@ prettytable
timeout-decorator timeout-decorator
timeout_decorator timeout_decorator
wandb wandb
swanlab[dashboard]

View File

@ -61,6 +61,7 @@ dependencies = [
"colorlog", "colorlog",
"psutil", "psutil",
"pynvml", "pynvml",
"swanlab[dashboard]",
# Performance and compression # Performance and compression
"ninja", "ninja",

View File

@ -848,6 +848,16 @@ class WandBConfig:
config: Optional[Dict] = None config: Optional[Dict] = None
@dataclass
class SwanlabConfig:
project: Optional[str] = None
name: Optional[str] = None
config: Optional[Dict] = None
logdir: Optional[str] = None
mode: Optional[str] = "local"
api_key: Optional[str] = os.getenv("SWANLAB_API_KEY", None)
@dataclass @dataclass
class TensorBoardConfig: class TensorBoardConfig:
path: Optional[str] = None path: Optional[str] = None
@ -951,6 +961,10 @@ class BaseExperimentConfig:
default_factory=WandBConfig, default_factory=WandBConfig,
metadata={"help": "Weights & Biases configuration."}, metadata={"help": "Weights & Biases configuration."},
) )
swanlab: SwanlabConfig = field(
default_factory=SwanlabConfig,
metadata={"help": "SwanLab configuration."},
)
tensorboard: TensorBoardConfig = field( tensorboard: TensorBoardConfig = field(
default_factory=TensorBoardConfig, default_factory=TensorBoardConfig,
metadata={"help": "TensorBoard configuration. Only 'path' field required."}, metadata={"help": "TensorBoard configuration. Only 'path' field required."},
@ -1026,7 +1040,7 @@ class BaseExperimentConfig:
default=False, default=False,
metadata={ metadata={
"help": "Enable automatic evaluation during training. " "help": "Enable automatic evaluation during training. "
"Results logged to disk and WandB (if active)." "Results logged to disk and WandB or SwanLab (if active)."
}, },
) )
auto_eval_config: AutomaticEvaluator = field( auto_eval_config: AutomaticEvaluator = field(

View File

@ -11,6 +11,7 @@ import realhf.api.core.dfg as dfg
from realhf.api.cli_args import ( from realhf.api.cli_args import (
AutomaticEvaluator, AutomaticEvaluator,
ExperimentSaveEvalControl, ExperimentSaveEvalControl,
SwanlabConfig,
TensorBoardConfig, TensorBoardConfig,
WandBConfig, WandBConfig,
) )
@ -254,6 +255,7 @@ class ExperimentScheduling:
class ExperimentConfig: class ExperimentConfig:
exp_ctrl: ExperimentSaveEvalControl exp_ctrl: ExperimentSaveEvalControl
wandb: WandBConfig wandb: WandBConfig
swanlab: SwanlabConfig
tensorboard: TensorBoardConfig tensorboard: TensorBoardConfig
# dataflow # dataflow
model_rpcs: List[dfg.MFCDef] model_rpcs: List[dfg.MFCDef]

View File

@ -94,7 +94,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
raise RuntimeError("Experiment initial setup failed.") from e raise RuntimeError("Experiment initial setup failed.") from e
evaluator = ( evaluator = (
AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb) AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab)
if exp_cfg.auto_eval if exp_cfg.auto_eval
else None else None
) )

View File

@ -141,19 +141,29 @@ def getLogger(
return logging.getLogger(name) return logging.getLogger(name)
_LATEST_WANDB_STEP = 0 _LATEST_LOG_STEP = 0
def log_wandb_tensorboard(data, step=None, summary_writer=None): def log_swanlab_wandb_tensorboard(data, step=None, summary_writer=None):
# Logs data to SwanLab, wandb, TensorBoard.
global _LATEST_LOG_STEP
if step is None:
step = _LATEST_LOG_STEP
else:
_LATEST_LOG_STEP = max(_LATEST_LOG_STEP, step)
# swanlab
import swanlab
swanlab.log(data, step=step)
# wandb
import wandb import wandb
global _LATEST_WANDB_STEP
if step is None:
step = _LATEST_WANDB_STEP
else:
_LATEST_WANDB_STEP = max(_LATEST_WANDB_STEP, step)
wandb.log(data, step=step) wandb.log(data, step=step)
# tensorboard
if summary_writer is not None: if summary_writer is not None:
for key, val in data.items(): for key, val in data.items():
summary_writer.add_scalar(f"{key}", val, step) summary_writer.add_scalar(f"{key}", val, step)

View File

@ -331,6 +331,7 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions):
return ExperimentConfig( return ExperimentConfig(
exp_ctrl=self.exp_ctrl, exp_ctrl=self.exp_ctrl,
wandb=self.wandb, wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard, tensorboard=self.tensorboard,
# NOTE: master and model worker only see RPCs without generation # NOTE: master and model worker only see RPCs without generation
model_rpcs=[ model_rpcs=[

View File

@ -564,6 +564,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
return ExperimentConfig( return ExperimentConfig(
exp_ctrl=self.exp_ctrl, exp_ctrl=self.exp_ctrl,
wandb=self.wandb, wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard, tensorboard=self.tensorboard,
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs], model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
model_worker=model_worker, model_worker=model_worker,

View File

@ -370,6 +370,7 @@ class PPOMATHConfig(CommonExperimentConfig, PPOMATHExperimentOptions):
return ExperimentConfig( return ExperimentConfig(
exp_ctrl=self.exp_ctrl, exp_ctrl=self.exp_ctrl,
wandb=self.wandb, wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard, tensorboard=self.tensorboard,
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs], model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
model_worker=model_worker, model_worker=model_worker,

View File

@ -8,6 +8,7 @@ import subprocess
import time import time
from typing import Dict, Optional from typing import Dict, Optional
import swanlab
import wandb import wandb
import realhf.api.core.system_api as config_pkg import realhf.api.core.system_api as config_pkg
@ -125,13 +126,15 @@ class EvaluationStep:
self.status = EvaluationStepStatus.FAILED self.status = EvaluationStepStatus.FAILED
return False return False
wandb_data = {} log_data = {}
for data_name, d in data.items(): for data_name, d in data.items():
for k, v in d.items(): for k, v in d.items():
wandb_data[f"{data_name}_{k}"] = v log_data[f"{data_name}_{k}"] = v
wandb.log(wandb_data, step=self.global_step) wandb.log(log_data, step=self.global_step)
swanlab.log(log_data, step=self.global_step)
self.status = EvaluationStepStatus.LOGGED self.status = EvaluationStepStatus.LOGGED
logger.info(f"Logging eval result {wandb_data} to step {self.global_step}") logger.info(f"Logging eval result {log_data} to step {self.global_step}")
return True return True
def check(self): def check(self):
@ -154,13 +157,15 @@ class AutomaticEvaluator:
self, self,
config: config_pkg.AutomaticEvaluator, config: config_pkg.AutomaticEvaluator,
wandb_config: config_pkg.WandBConfig, wandb_config: config_pkg.WandBConfig,
swanlab_config: config_pkg.SwanlabConfig,
): ):
self.__eval_steps: Dict[int, EvaluationStep] = {} self.__eval_steps: Dict[int, EvaluationStep] = {}
self.__max_concurrent_jobs = config.max_concurrent_jobs self.__max_concurrent_jobs = config.max_concurrent_jobs
self.__wandb_config = wandb_config self.__wandb_config = wandb_config
self.__swanlab_config = swanlab_config
self.__config = config self.__config = config
self.__wandb_initialized = False self.__wandb_initialized = False
self.__swanlab_initialized = False
# Check evaluated checkpoints by logs in recover # Check evaluated checkpoints by logs in recover
# NOTE: All previous evaluation steps with output will be marked # NOTE: All previous evaluation steps with output will be marked
# as logged, even if it is not really logged in wandb. # as logged, even if it is not really logged in wandb.
@ -228,6 +233,40 @@ class AutomaticEvaluator:
settings=wandb.Settings(start_method="fork"), settings=wandb.Settings(start_method="fork"),
) )
def __lazy_swanlab_init(self):
if self.__swanlab_config.api_key:
swanlab.login(self.__swanlab_config.api_key)
if self.swanlab_config.config is None:
import yaml
with open(
os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"config.yaml",
),
"r",
) as f:
__config = yaml.safe_load(f)
else:
__config = self.swanlab_config.config
__config["FRAMEWORK"] = "AReaL"
swanlab.init(
project=self.__swanlab_config.project or constants.experiment_name(),
experiment_name=self.__swanlab_config.name
or f"{constants.trial_name()}_eval",
config=__config,
logdir=self.__swanlab_config.logdir
or os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"swanlab",
),
mode=self.__swanlab_config.mode,
)
def step(self): def step(self):
# Check whether a new evaluation step should be created # Check whether a new evaluation step should be created
ckpt_parent = os.path.join( ckpt_parent = os.path.join(
@ -292,6 +331,9 @@ class AutomaticEvaluator:
if not self.__wandb_initialized: if not self.__wandb_initialized:
self.__lazy_wandb_init() self.__lazy_wandb_init()
self.__wandb_initialized = True self.__wandb_initialized = True
if not self.__swanlab_initialized:
self.__lazy_swanlab_init()
self.__swanlab_initialized = True
self.__eval_steps[log_step].log(self.__config) self.__eval_steps[log_step].log(self.__config)
@property @property

View File

@ -12,6 +12,7 @@ from typing import Dict
import colorama import colorama
import networkx as nx import networkx as nx
import numpy as np import numpy as np
import swanlab
import wandb import wandb
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
@ -312,6 +313,40 @@ class MasterWorker(worker_base.AsyncWorker):
resume="allow", resume="allow",
settings=wandb.Settings(start_method="fork"), settings=wandb.Settings(start_method="fork"),
) )
# swanlab init, connect to remote or local swanlab host
if self.swanlab_config.mode != "disabled" and self.swanlab_config.api_key:
swanlab.login(self.swanlab_config.api_key)
if self.swanlab_config.config is None:
import yaml
with open(
os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"config.yaml",
),
"r",
) as f:
__config = yaml.safe_load(f)
else:
__config = self.swanlab_config.config
__config["FRAMEWORK"] = "AReaL"
swanlab.init(
project=self.swanlab_config.project or constants.experiment_name(),
experiment_name=self.swanlab_config.name
or f"{constants.trial_name()}_train",
config=__config,
logdir=self.swanlab_config.logdir
or os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"swanlab",
),
mode=self.swanlab_config.mode,
)
# tensorboard logging # tensorboard logging
self.__summary_writer = None self.__summary_writer = None
if self.tensorboard_config.path is not None: if self.tensorboard_config.path is not None:
@ -487,7 +522,7 @@ class MasterWorker(worker_base.AsyncWorker):
s += f"(global step {global_step}) finishes. " s += f"(global step {global_step}) finishes. "
s += f"#End to end# execution time: *{e2e_time:.3f}*s. " s += f"#End to end# execution time: *{e2e_time:.3f}*s. "
s += f"Total time consumption: {time_since_configure:.3f}s. " s += f"Total time consumption: {time_since_configure:.3f}s. "
logging.log_wandb_tensorboard({"timeperf/e2e": e2e_time}) logging.log_swanlab_wandb_tensorboard({"timeperf/e2e": e2e_time})
if len(self.e2e_time_history) > 2: if len(self.e2e_time_history) > 2:
remaining_steps = self._steps_per_epoch - epoch_step remaining_steps = self._steps_per_epoch - epoch_step
remaining_epochs = self.__total_train_epochs - epoch remaining_epochs = self.__total_train_epochs - epoch
@ -540,6 +575,7 @@ class MasterWorker(worker_base.AsyncWorker):
) )
wandb.finish() wandb.finish()
swanlab.finish()
if self.__summary_writer is not None: if self.__summary_writer is not None:
self.__summary_writer.close() self.__summary_writer.close()
gc.collect() gc.collect()

View File

@ -10,6 +10,7 @@ import uuid
from collections import defaultdict from collections import defaultdict
from typing import Dict, Hashable, List, Set, Tuple from typing import Dict, Hashable, List, Set, Tuple
import swanlab
import wandb import wandb
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
@ -442,7 +443,7 @@ class ModelFunctionCall:
logger.info( logger.info(
f"RPC name {rpc.name} returns\n{data_api.tabulate_stats(res)}" f"RPC name {rpc.name} returns\n{data_api.tabulate_stats(res)}"
) )
logging.log_wandb_tensorboard( logging.log_swanlab_wandb_tensorboard(
res, res,
step=ctrl.step_info.global_step, step=ctrl.step_info.global_step,
summary_writer=self.summary_writer, summary_writer=self.summary_writer,
@ -453,7 +454,7 @@ class ModelFunctionCall:
f"RPC name {rpc.name} returns ({j + 1}/{len(res)})\n{data_api.tabulate_stats(r)}" f"RPC name {rpc.name} returns ({j + 1}/{len(res)})\n{data_api.tabulate_stats(r)}"
) )
offset = len(res) * ctrl.step_info.global_step offset = len(res) * ctrl.step_info.global_step
logging.log_wandb_tensorboard( logging.log_swanlab_wandb_tensorboard(
r, r,
step=offset + j, step=offset + j,
summary_writer=self.summary_writer, summary_writer=self.summary_writer,
@ -465,11 +466,10 @@ class ModelFunctionCall:
for time_record in time_records: for time_record in time_records:
stats_tracker.scalar(**time_record) stats_tracker.scalar(**time_record)
time_stats = stats_tracker.export() time_stats = stats_tracker.export()
logging.log_wandb_tensorboard( logging.log_swanlab_wandb_tensorboard(
time_stats, time_stats,
summary_writer=self.summary_writer, summary_writer=self.summary_writer,
) )
logger.info( logger.info(
f"Model rpc {rpc.name} finished. " f"Model rpc {rpc.name} finished. "
f"Request-reply time {time.perf_counter() - tik:.4f}s. " f"Request-reply time {time.perf_counter() - tik:.4f}s. "

View File

@ -580,7 +580,9 @@ class Worker:
) )
expr_config.lazy_init() expr_config.lazy_init()
self.wandb_config = expr_config.wandb self.wandb_config = expr_config.wandb
self.swanlab_config = expr_config.swanlab
os.environ["WANDB_MODE"] = self.wandb_config.mode os.environ["WANDB_MODE"] = self.wandb_config.mode
os.environ["SWANLAB_MODE"] = self.swanlab_config.mode
self.tensorboard_config = expr_config.tensorboard self.tensorboard_config = expr_config.tensorboard
config = expr_config.resolve_worker_config( config = expr_config.resolve_worker_config(
self.__worker_type, self.__worker_index self.__worker_type, self.__worker_index

View File

@ -68,4 +68,5 @@ python_dateutil
word2number word2number
Pebble Pebble
timeout-decorator timeout-decorator
prettytable prettytable
swanlab[dashboard]

View File

@ -14,6 +14,13 @@ wandb:
notes: null notes: null
tags: null tags: null
config: null config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard: tensorboard:
path: null path: null
recover_mode: auto recover_mode: auto

View File

@ -14,6 +14,13 @@ wandb:
notes: null notes: null
tags: null tags: null
config: null config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard: tensorboard:
path: null path: null
recover_mode: auto recover_mode: auto

View File

@ -14,6 +14,13 @@ wandb:
notes: null notes: null
tags: null tags: null
config: null config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard: tensorboard:
path: null path: null
recover_mode: auto recover_mode: auto

View File

@ -90,6 +90,7 @@ class RayWorker:
worker_info.experiment_name, worker_info.trial_name worker_info.experiment_name, worker_info.trial_name
) )
self.worker.wandb_config = expr_config.wandb self.worker.wandb_config = expr_config.wandb
self.worker.swanlab_config = expr_config.swanlab
self.worker.tensorboard_config = expr_config.tensorboard self.worker.tensorboard_config = expr_config.tensorboard
self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark") self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark")
self.logger.info(f"Configuring {self.worker_type}...") self.logger.info(f"Configuring {self.worker_type}...")
@ -125,6 +126,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
# Initialize ray in the Ray cluster # Initialize ray in the Ray cluster
env_vars = constants.get_env_vars( env_vars = constants.get_env_vars(
WADNB_MODE=exp_cfg.wandb.mode, WADNB_MODE=exp_cfg.wandb.mode,
SWANLAB_MODE=exp_cfg.swanlab.mode,
REAL_MODE="ray", REAL_MODE="ray",
REAL_RECOVER_RUN="0", REAL_RECOVER_RUN="0",
REAL_SAVE_RECOVER_STATES="1", REAL_SAVE_RECOVER_STATES="1",