Support using SwanLab for experiment tracking (#98)

* Support using SwanLab for experiment tracking

* docs: improve WandB and SwanLab integration documentation
- Added official links for better user reference
- Used backticks to quote commands and parameters
- Unified mode settings to use "online" / "cloud" convention
- Merged WandB and SwanLab descriptions into a single concise statement
- Added note on using `swanlab.mode="local"` when server connection is unavailable

* refactor: update default value of api_key

* fix: correct help description from WandB to SwanLab in SwanLabConfig

* refactor: merge log_swanlab_tensorboard and log_wandb_tensorboard into log_swanlab_wandb_tensorboard

 - Unified logging logic for SwanLab, WandB, and TensorBoard to reduce code duplication

* chore: update swanlab version in dependency config files

 - Updated SwanLab version in pyproject.toml
 - Updated SwanLab version in requirements.txt

* refactor: enhance SwanLab config handling for logging purposes
- Config now uses provided arguments first
- Falls back to reading from config.yaml if no input is given

* docs: add note on using `swanlab.mode="local"` when server connection is unavailable

* refactor: merge _LATEST_WANDB_STEP and _LATEST_SWANLAB_STEP into _LATEST_LOG_STEP

* Format code with black and isort

* chore: update swanlab version in dependency config files
- Updated SwanLab version in requirements.txt

* refactor: rename swanlab_wandb_data to log_data

---------

Co-authored-by: dubingnan <dubingnan@360.cn>
xichengpro 2025-06-16 19:51:31 +08:00 committed by GitHub
parent f2f4b67bcd
commit bb14f022dc
19 changed files with 160 additions and 22 deletions

@@ -97,12 +97,15 @@ python3 training/main_sync_ppo.py --help
## Monitoring the Training Process
- We recommend using Weights & Biases (wandb) for monitoring. Run `wandb login` or set the `WANDB_API_KEY` environment variable. Set `wandb.mode=online` in your configuration to upload training statistics.
+ We recommend using [Weights & Biases (wandb)](https://github.com/wandb/wandb) or [SwanLab](https://github.com/SwanHubX/SwanLab) for monitoring. Run `wandb login` or `swanlab login`, or set the corresponding API key environment variable (`WANDB_API_KEY` or `SWANLAB_API_KEY`). Set `wandb.mode="online"` or `swanlab.mode="cloud"` in your configuration to upload training statistics. If you cannot connect to the server, use `wandb.mode="offline"` or `swanlab.mode="local"` to save data locally without uploading.
You can also use TensorBoard by setting the `tensorboard.path` parameter.
The main log will be saved to `${fileroot}/logs/${USER}/${experiment_name}/${trial_name}/main.log` and contains the statistics uploaded to wandb.
+ If SwanLab is enabled, logs will be saved to the directory specified by `swanlab.logdir`.
### Key Training Statistics
- **`Epoch 1/5`**: Indicates the total epochs required and the current epoch being trained.
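
For readers new to SwanLab, here is a minimal standalone sketch of the two modes mentioned above; the project name and logdir are illustrative, and only the `swanlab` package is assumed:

import swanlab

# mode="cloud" uploads runs to the SwanLab server (requires `swanlab login`
# or the SWANLAB_API_KEY environment variable); mode="local" only writes
# to logdir, analogous to wandb's "offline" mode.
run = swanlab.init(project="areal-demo", mode="local", logdir="/tmp/swanlab")
swanlab.log({"reward": 0.5}, step=1)
swanlab.finish()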

@@ -15,3 +15,4 @@ prettytable
timeout-decorator
timeout_decorator
wandb
swanlab[dashboard]

@@ -61,6 +61,7 @@ dependencies = [
"colorlog",
"psutil",
"pynvml",
"swanlab[dashboard]",
# Performance and compression
"ninja",

@@ -848,6 +848,16 @@ class WandBConfig:
config: Optional[Dict] = None
@dataclass
class SwanlabConfig:
project: Optional[str] = None
name: Optional[str] = None
config: Optional[Dict] = None
logdir: Optional[str] = None
mode: Optional[str] = "local"
api_key: Optional[str] = os.getenv("SWANLAB_API_KEY", None)
@dataclass
class TensorBoardConfig:
path: Optional[str] = None
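
As orientation for the new `SwanlabConfig` dataclass above, a hedged construction sketch; the field values are illustrative, and the import path follows the `realhf.api.cli_args` import that appears later in this diff:

from realhf.api.cli_args import SwanlabConfig

# mode follows the "cloud" / "local" / "disabled" convention used by the
# example YAML configs in this commit; api_key defaults to $SWANLAB_API_KEY.
cfg = SwanlabConfig(project="my-project", name="trial-0", mode="cloud")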
@@ -951,6 +961,10 @@ class BaseExperimentConfig:
default_factory=WandBConfig,
metadata={"help": "Weights & Biases configuration."},
)
swanlab: SwanlabConfig = field(
default_factory=SwanlabConfig,
metadata={"help": "SwanLab configuration."},
)
tensorboard: TensorBoardConfig = field(
default_factory=TensorBoardConfig,
metadata={"help": "TensorBoard configuration. Only 'path' field required."},
@@ -1026,7 +1040,7 @@ class BaseExperimentConfig:
default=False,
metadata={
"help": "Enable automatic evaluation during training. "
"Results logged to disk and WandB (if active)."
"Results logged to disk and WandB or Swanlab(if active)."
},
)
auto_eval_config: AutomaticEvaluator = field(

@@ -11,6 +11,7 @@ import realhf.api.core.dfg as dfg
from realhf.api.cli_args import (
AutomaticEvaluator,
ExperimentSaveEvalControl,
SwanlabConfig,
TensorBoardConfig,
WandBConfig,
)
@@ -254,6 +255,7 @@ class ExperimentScheduling:
class ExperimentConfig:
exp_ctrl: ExperimentSaveEvalControl
wandb: WandBConfig
swanlab: SwanlabConfig
tensorboard: TensorBoardConfig
# dataflow
model_rpcs: List[dfg.MFCDef]

@@ -94,7 +94,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
raise RuntimeError("Experiment initial setup failed.") from e
evaluator = (
-AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb)
+AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab)
if exp_cfg.auto_eval
else None
)

@@ -141,19 +141,29 @@ def getLogger(
return logging.getLogger(name)
-_LATEST_WANDB_STEP = 0
+_LATEST_LOG_STEP = 0
-def log_wandb_tensorboard(data, step=None, summary_writer=None):
+def log_swanlab_wandb_tensorboard(data, step=None, summary_writer=None):
+    # Logs data to SwanLab, wandb, and TensorBoard.
+    global _LATEST_LOG_STEP
+    if step is None:
+        step = _LATEST_LOG_STEP
+    else:
+        _LATEST_LOG_STEP = max(_LATEST_LOG_STEP, step)
+    # swanlab
+    import swanlab
+    swanlab.log(data, step=step)
     # wandb
     import wandb
-    global _LATEST_WANDB_STEP
-    if step is None:
-        step = _LATEST_WANDB_STEP
-    else:
-        _LATEST_WANDB_STEP = max(_LATEST_WANDB_STEP, step)
     wandb.log(data, step=step)
     # tensorboard
     if summary_writer is not None:
         for key, val in data.items():
             summary_writer.add_scalar(f"{key}", val, step)
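
A hedged call-site sketch of the merged helper; it assumes the wandb and swanlab runs were already initialized (as the master worker does below), and the module path for the helper is an assumption:

from tensorboardX import SummaryWriter

import realhf.base.logging as logging  # assumed location of the helper

writer = SummaryWriter(logdir="/tmp/tb")  # optional; pass None to skip TensorBoard
logging.log_swanlab_wandb_tensorboard({"loss": 0.42}, step=100, summary_writer=writer)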

@@ -331,6 +331,7 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions):
return ExperimentConfig(
exp_ctrl=self.exp_ctrl,
wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard,
# NOTE: master and model worker only see RPCs without generation
model_rpcs=[

@@ -564,6 +564,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
return ExperimentConfig(
exp_ctrl=self.exp_ctrl,
wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard,
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
model_worker=model_worker,

@@ -370,6 +370,7 @@ class PPOMATHConfig(CommonExperimentConfig, PPOMATHExperimentOptions):
return ExperimentConfig(
exp_ctrl=self.exp_ctrl,
wandb=self.wandb,
swanlab=self.swanlab,
tensorboard=self.tensorboard,
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
model_worker=model_worker,

@@ -8,6 +8,7 @@ import subprocess
import time
from typing import Dict, Optional
import swanlab
import wandb
import realhf.api.core.system_api as config_pkg
@@ -125,13 +126,15 @@ class EvaluationStep:
self.status = EvaluationStepStatus.FAILED
return False
-        wandb_data = {}
+        log_data = {}
         for data_name, d in data.items():
             for k, v in d.items():
-                wandb_data[f"{data_name}_{k}"] = v
-        wandb.log(wandb_data, step=self.global_step)
+                log_data[f"{data_name}_{k}"] = v
+        wandb.log(log_data, step=self.global_step)
+        swanlab.log(log_data, step=self.global_step)
         self.status = EvaluationStepStatus.LOGGED
-        logger.info(f"Logging eval result {wandb_data} to step {self.global_step}")
+        logger.info(f"Logging eval result {log_data} to step {self.global_step}")
return True
def check(self):
@@ -154,13 +157,15 @@ class AutomaticEvaluator:
self,
config: config_pkg.AutomaticEvaluator,
wandb_config: config_pkg.WandBConfig,
swanlab_config: config_pkg.SwanlabConfig,
):
self.__eval_steps: Dict[int, EvaluationStep] = {}
self.__max_concurrent_jobs = config.max_concurrent_jobs
self.__wandb_config = wandb_config
self.__swanlab_config = swanlab_config
self.__config = config
self.__wandb_initialized = False
self.__swanlab_initialized = False
# Check evaluated checkpoints by logs in recover
# NOTE: All previous evaluation steps with output will be marked
# as logged, even if it is not really logged in wandb.
@@ -228,6 +233,40 @@ class AutomaticEvaluator:
settings=wandb.Settings(start_method="fork"),
)
def __lazy_swanlab_init(self):
if self.__swanlab_config.api_key:
swanlab.login(self.__swanlab_config.api_key)
if self.__swanlab_config.config is None:
import yaml
with open(
os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"config.yaml",
),
"r",
) as f:
__config = yaml.safe_load(f)
else:
__config = self.__swanlab_config.config
__config["FRAMEWORK"] = "AReaL"
swanlab.init(
project=self.__swanlab_config.project or constants.experiment_name(),
experiment_name=self.__swanlab_config.name
or f"{constants.trial_name()}_eval",
config=__config,
logdir=self.__swanlab_config.logdir
or os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"swanlab",
),
mode=self.__swanlab_config.mode,
)
def step(self):
# Check whether a new evaluation step should be created
ckpt_parent = os.path.join(
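
`__lazy_swanlab_init` above implements the precedence described in the commit message: an explicitly provided `swanlab.config` wins, otherwise the trial's dumped `config.yaml` is loaded. A condensed sketch of just that decision; the function name and arguments are hypothetical:

import os

import yaml

def resolve_swanlab_config(explicit_config, log_root, expr_name, trial_name):
    # Prefer the user-supplied dict; otherwise fall back to the config.yaml
    # dumped under the trial's log directory, as both call sites in this commit do.
    if explicit_config is not None:
        cfg = dict(explicit_config)
    else:
        with open(os.path.join(log_root, expr_name, trial_name, "config.yaml")) as f:
            cfg = yaml.safe_load(f)
    cfg["FRAMEWORK"] = "AReaL"  # tags runs so they are identifiable in SwanLab
    return cfg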
@@ -292,6 +331,9 @@ class AutomaticEvaluator:
if not self.__wandb_initialized:
self.__lazy_wandb_init()
self.__wandb_initialized = True
if not self.__swanlab_initialized:
self.__lazy_swanlab_init()
self.__swanlab_initialized = True
self.__eval_steps[log_step].log(self.__config)
@property

@@ -12,6 +12,7 @@ from typing import Dict
import colorama
import networkx as nx
import numpy as np
import swanlab
import wandb
from tensorboardX import SummaryWriter
@@ -312,6 +313,40 @@ class MasterWorker(worker_base.AsyncWorker):
resume="allow",
settings=wandb.Settings(start_method="fork"),
)
# swanlab init, connect to remote or local swanlab host
if self.swanlab_config.mode != "disabled" and self.swanlab_config.api_key:
swanlab.login(self.swanlab_config.api_key)
if self.swanlab_config.config is None:
import yaml
with open(
os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"config.yaml",
),
"r",
) as f:
__config = yaml.safe_load(f)
else:
__config = self.swanlab_config.config
__config["FRAMEWORK"] = "AReaL"
swanlab.init(
project=self.swanlab_config.project or constants.experiment_name(),
experiment_name=self.swanlab_config.name
or f"{constants.trial_name()}_train",
config=__config,
logdir=self.swanlab_config.logdir
or os.path.join(
constants.LOG_ROOT,
constants.experiment_name(),
constants.trial_name(),
"swanlab",
),
mode=self.swanlab_config.mode,
)
# tensorboard logging
self.__summary_writer = None
if self.tensorboard_config.path is not None:
@@ -487,7 +522,7 @@ class MasterWorker(worker_base.AsyncWorker):
s += f"(global step {global_step}) finishes. "
s += f"#End to end# execution time: *{e2e_time:.3f}*s. "
s += f"Total time consumption: {time_since_configure:.3f}s. "
-logging.log_wandb_tensorboard({"timeperf/e2e": e2e_time})
+logging.log_swanlab_wandb_tensorboard({"timeperf/e2e": e2e_time})
if len(self.e2e_time_history) > 2:
remaining_steps = self._steps_per_epoch - epoch_step
remaining_epochs = self.__total_train_epochs - epoch
@@ -540,6 +575,7 @@
)
wandb.finish()
swanlab.finish()
if self.__summary_writer is not None:
self.__summary_writer.close()
gc.collect()

@@ -10,6 +10,7 @@ import uuid
from collections import defaultdict
from typing import Dict, Hashable, List, Set, Tuple
import swanlab
import wandb
from tensorboardX import SummaryWriter
@@ -442,7 +443,7 @@ class ModelFunctionCall:
logger.info(
f"RPC name {rpc.name} returns\n{data_api.tabulate_stats(res)}"
)
-logging.log_wandb_tensorboard(
+logging.log_swanlab_wandb_tensorboard(
res,
step=ctrl.step_info.global_step,
summary_writer=self.summary_writer,
@@ -453,7 +454,7 @@
f"RPC name {rpc.name} returns ({j + 1}/{len(res)})\n{data_api.tabulate_stats(r)}"
)
offset = len(res) * ctrl.step_info.global_step
-logging.log_wandb_tensorboard(
+logging.log_swanlab_wandb_tensorboard(
r,
step=offset + j,
summary_writer=self.summary_writer,
@@ -465,11 +466,10 @@ class ModelFunctionCall:
for time_record in time_records:
stats_tracker.scalar(**time_record)
time_stats = stats_tracker.export()
-logging.log_wandb_tensorboard(
+logging.log_swanlab_wandb_tensorboard(
time_stats,
summary_writer=self.summary_writer,
)
logger.info(
f"Model rpc {rpc.name} finished. "
f"Request-reply time {time.perf_counter() - tik:.4f}s. "

@@ -580,7 +580,9 @@ class Worker:
)
expr_config.lazy_init()
self.wandb_config = expr_config.wandb
self.swanlab_config = expr_config.swanlab
os.environ["WANDB_MODE"] = self.wandb_config.mode
os.environ["SWANLAB_MODE"] = self.swanlab_config.mode
self.tensorboard_config = expr_config.tensorboard
config = expr_config.resolve_worker_config(
self.__worker_type, self.__worker_index
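
The hunk above also exports the mode as `SWANLAB_MODE`, mirroring `WANDB_MODE`, so child processes inherit it. A minimal illustration, assuming SwanLab honors this variable the way wandb honors `WANDB_MODE`, which is what this code relies on:

import os

os.environ["SWANLAB_MODE"] = "disabled"  # same value as swanlab.mode in the config
# swanlab.init(...) calls in this process and its children then pick up the mode.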

@@ -68,4 +68,5 @@ python_dateutil
word2number
Pebble
timeout-decorator
-prettytable
+prettytable
swanlab[dashboard]

@@ -14,6 +14,13 @@ wandb:
notes: null
tags: null
config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard:
path: null
recover_mode: auto
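
The same `swanlab:` block is added to three example configs (this file and the two below). A hedged sketch of how such a YAML section maps onto the dataclass, assuming plain `yaml` loading and manual construction rather than the repo's actual config machinery:

import yaml

from realhf.api.cli_args import SwanlabConfig

with open("config.yaml") as f:
    raw = yaml.safe_load(f)
swanlab_cfg = SwanlabConfig(**raw["swanlab"])  # mode="disabled" keeps SwanLab off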

@@ -14,6 +14,13 @@ wandb:
notes: null
tags: null
config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard:
path: null
recover_mode: auto

@@ -14,6 +14,13 @@ wandb:
notes: null
tags: null
config: null
swanlab:
mode: disabled
api_key: null
project: null
name: null
config: null
logdir: null
tensorboard:
path: null
recover_mode: auto

@@ -90,6 +90,7 @@ class RayWorker:
worker_info.experiment_name, worker_info.trial_name
)
self.worker.wandb_config = expr_config.wandb
self.worker.swanlab_config = expr_config.swanlab
self.worker.tensorboard_config = expr_config.tensorboard
self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark")
self.logger.info(f"Configuring {self.worker_type}...")
@@ -125,6 +126,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
# Initialize ray in the Ray cluster
env_vars = constants.get_env_vars(
WANDB_MODE=exp_cfg.wandb.mode,
SWANLAB_MODE=exp_cfg.swanlab.mode,
REAL_MODE="ray",
REAL_RECOVER_RUN="0",
REAL_SAVE_RECOVER_STATES="1",