mirror of https://github.com/inclusionAI/AReaL
Support using SwanLab for experiment tracking (#98)
* Support using SwanLab for experiment tracking * docs: improve WandB and SwanLab integration documentation - Added official links for better user reference - Used backticks to quote commands and parameters - Unified mode settings to use "online" / "cloud" convention - Merged WandB and SwanLab descriptions into a single concise statement - Added note on using `swanlab.mode="local"` when server connection is unavailable * refactor: update default value of api_key * fix: correct help description from WandB to SwanLab in SwanLabConfig * refactor: merge log_swanlab_tensorboard and log_wandb_tensorboard into log_swanlab_wandb_tensorboard - Unified logging logic for SwanLab, WandB, and TensorBoard to reduce code duplication * chore: update swanlab version in dependency config files - Updated SwanLab version in pyproject.toml - Updated SwanLab version in requirements.txt * refactor: enhance SwanLab config handling for logging purposes - Config now uses provided arguments first - Falls back to reading from config.yaml if no input is given * docs: add note on using when server connection is unavailable * refactor: merge _LATEST_WANDB_STEP and _LATEST_SWANLAB_STEP into _LATEST_LOG_STEP * Format code with black and isort * chore: update swanlab version in dependency config files - Updated SwanLab version in requirements.txt * refactor: rename swanlab_wandb_data to log_data --------- Co-authored-by: dubingnan <dubingnan@360.cn>
This commit is contained in:
parent
f2f4b67bcd
commit
bb14f022dc
|
@ -97,12 +97,15 @@ python3 training/main_sync_ppo.py --help
|
|||
|
||||
## Monitoring the Training Process
|
||||
|
||||
We recommend using Weights & Biases (wandb) for monitoring. Run `wandb login` or set the `WANDB_API_KEY` environment variable. Set `wandb.mode=online` in your configuration to upload training statistics.
|
||||
+ We recommend using [Weights & Biases (wandb)](https://github.com/wandb/wandb) or [SwanLab](https://github.com/SwanHubX/SwanLab) for monitoring—run `wandb login` or `swanlab login`, or set the corresponding environment variable API key (`WANDB_API_KEY` or `SWANLAB_API_KEY`). Set `wandb.mode="online"` or `swanlab.mode="cloud"` in your configuration to upload training statistics. If you cannot connect to the server, you can also use `wandb.mode="offline"` or `swanlab.mode="local"` to save data locally without uploading.
|
||||
|
||||
|
||||
You can also use TensorBoard by setting the `tensorboard.path` parameter.
|
||||
|
||||
The main log will be saved to `${fileroot}/logs/${USER}/${experiment_name}/${trial_name}/main.log` and contains the statistics uploaded to wandb.
|
||||
|
||||
If SwanLab is enabled, logs will be saved to the directory specified by `swanlab.logdir`.
|
||||
|
||||
### Key Training Statistics
|
||||
|
||||
- **`Epoch 1/5`**: Indicates the total epochs required and the current epoch being trained.
|
||||
|
|
|
@ -15,3 +15,4 @@ prettytable
|
|||
timeout-decorator
|
||||
timeout_decorator
|
||||
wandb
|
||||
swanlab[dashboard]
|
|
@ -61,6 +61,7 @@ dependencies = [
|
|||
"colorlog",
|
||||
"psutil",
|
||||
"pynvml",
|
||||
"swanlab[dashboard]",
|
||||
|
||||
# Performance and compression
|
||||
"ninja",
|
||||
|
|
|
@ -848,6 +848,16 @@ class WandBConfig:
|
|||
config: Optional[Dict] = None
|
||||
|
||||
|
||||
@dataclass
class SwanlabConfig:
    """Configuration for SwanLab experiment tracking.

    Mirrors :class:`WandBConfig`; any field left as ``None`` falls back to an
    experiment-derived default when SwanLab is initialized.
    """

    # SwanLab project name; falls back to the experiment name when None.
    project: Optional[str] = None
    # Run name shown in the SwanLab UI; falls back to the trial name when None.
    name: Optional[str] = None
    # Extra config dict attached to the run; when None, the experiment's
    # dumped config.yaml is loaded and used instead.
    config: Optional[Dict] = None
    # Local directory for SwanLab run data; defaults to a "swanlab" folder
    # under the experiment's log root when None.
    logdir: Optional[str] = None
    # "cloud" uploads to the SwanLab server, "local" keeps runs on disk,
    # "disabled" turns tracking off.
    mode: Optional[str] = "local"
    # Read the environment at *instantiation* time (default_factory) rather
    # than once at import time, so SWANLAB_API_KEY set after import is honored.
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv("SWANLAB_API_KEY")
    )
|
||||
|
||||
|
||||
@dataclass
class TensorBoardConfig:
    """Configuration for TensorBoard logging.

    Only ``path`` is required; when it is ``None``, no SummaryWriter is
    created and TensorBoard logging is disabled.
    """

    # Directory where the tensorboardX SummaryWriter writes event files.
    path: Optional[str] = None
|
||||
|
@ -951,6 +961,10 @@ class BaseExperimentConfig:
|
|||
default_factory=WandBConfig,
|
||||
metadata={"help": "Weights & Biases configuration."},
|
||||
)
|
||||
swanlab: SwanlabConfig = field(
|
||||
default_factory=SwanlabConfig,
|
||||
metadata={"help": "SwanLab configuration."},
|
||||
)
|
||||
tensorboard: TensorBoardConfig = field(
|
||||
default_factory=TensorBoardConfig,
|
||||
metadata={"help": "TensorBoard configuration. Only 'path' field required."},
|
||||
|
@ -1026,7 +1040,7 @@ class BaseExperimentConfig:
|
|||
default=False,
|
||||
metadata={
|
||||
"help": "Enable automatic evaluation during training. "
|
||||
"Results logged to disk and WandB (if active)."
|
||||
"Results logged to disk and WandB or SwanLab (if active)."
|
||||
},
|
||||
)
|
||||
auto_eval_config: AutomaticEvaluator = field(
|
||||
|
|
|
@ -11,6 +11,7 @@ import realhf.api.core.dfg as dfg
|
|||
from realhf.api.cli_args import (
|
||||
AutomaticEvaluator,
|
||||
ExperimentSaveEvalControl,
|
||||
SwanlabConfig,
|
||||
TensorBoardConfig,
|
||||
WandBConfig,
|
||||
)
|
||||
|
@ -254,6 +255,7 @@ class ExperimentScheduling:
|
|||
class ExperimentConfig:
|
||||
exp_ctrl: ExperimentSaveEvalControl
|
||||
wandb: WandBConfig
|
||||
swanlab: SwanlabConfig
|
||||
tensorboard: TensorBoardConfig
|
||||
# dataflow
|
||||
model_rpcs: List[dfg.MFCDef]
|
||||
|
|
|
@ -94,7 +94,7 @@ def main_start(args, job_group_id: str = "", recover_count: int = 0):
|
|||
raise RuntimeError("Experiment initial setup failed.") from e
|
||||
|
||||
evaluator = (
|
||||
AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb)
|
||||
AutomaticEvaluator(exp_cfg.evaluator, exp_cfg.wandb, exp_cfg.swanlab)
|
||||
if exp_cfg.auto_eval
|
||||
else None
|
||||
)
|
||||
|
|
|
@ -141,19 +141,29 @@ def getLogger(
|
|||
return logging.getLogger(name)
|
||||
|
||||
|
||||
# Highest step ever logged; reused when a caller does not pass an explicit
# step so that step-less logs do not rewind the trackers' timelines.
_LATEST_LOG_STEP = 0


def log_swanlab_wandb_tensorboard(data, step=None, summary_writer=None):
    """Log a dict of scalar metrics to SwanLab, wandb, and TensorBoard.

    :param data: Mapping from metric name to scalar value.
    :param step: Global step for this log entry. When None, the latest step
        seen so far is reused; otherwise the latest-step watermark is advanced.
    :param summary_writer: Optional tensorboardX SummaryWriter. When None,
        the TensorBoard sink is skipped.
    """
    global _LATEST_LOG_STEP
    if step is None:
        step = _LATEST_LOG_STEP
    else:
        _LATEST_LOG_STEP = max(_LATEST_LOG_STEP, step)

    # Lazy imports keep these heavy tracker packages off the module import path.
    # swanlab
    import swanlab

    swanlab.log(data, step=step)

    # wandb
    import wandb

    wandb.log(data, step=step)

    # tensorboard
    if summary_writer is not None:
        for key, val in data.items():
            summary_writer.add_scalar(f"{key}", val, step)
|
||||
|
|
|
@ -331,6 +331,7 @@ class AsyncRLExperimentConfig(CommonExperimentConfig, AsyncRLOptions):
|
|||
return ExperimentConfig(
|
||||
exp_ctrl=self.exp_ctrl,
|
||||
wandb=self.wandb,
|
||||
swanlab=self.swanlab,
|
||||
tensorboard=self.tensorboard,
|
||||
# NOTE: master and model worker only see RPCs without generation
|
||||
model_rpcs=[
|
||||
|
|
|
@ -564,6 +564,7 @@ class CommonExperimentConfig(BaseExperimentConfig, Experiment):
|
|||
return ExperimentConfig(
|
||||
exp_ctrl=self.exp_ctrl,
|
||||
wandb=self.wandb,
|
||||
swanlab=self.swanlab,
|
||||
tensorboard=self.tensorboard,
|
||||
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
|
||||
model_worker=model_worker,
|
||||
|
|
|
@ -370,6 +370,7 @@ class PPOMATHConfig(CommonExperimentConfig, PPOMATHExperimentOptions):
|
|||
return ExperimentConfig(
|
||||
exp_ctrl=self.exp_ctrl,
|
||||
wandb=self.wandb,
|
||||
swanlab=self.swanlab,
|
||||
tensorboard=self.tensorboard,
|
||||
model_rpcs=[rpc_alloc.rpc for rpc_alloc in rpc_allocs],
|
||||
model_worker=model_worker,
|
||||
|
|
|
@ -8,6 +8,7 @@ import subprocess
|
|||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
import swanlab
|
||||
import wandb
|
||||
|
||||
import realhf.api.core.system_api as config_pkg
|
||||
|
@ -125,13 +126,15 @@ class EvaluationStep:
|
|||
self.status = EvaluationStepStatus.FAILED
|
||||
return False
|
||||
|
||||
wandb_data = {}
|
||||
log_data = {}
|
||||
for data_name, d in data.items():
|
||||
for k, v in d.items():
|
||||
wandb_data[f"{data_name}_{k}"] = v
|
||||
wandb.log(wandb_data, step=self.global_step)
|
||||
log_data[f"{data_name}_{k}"] = v
|
||||
wandb.log(log_data, step=self.global_step)
|
||||
swanlab.log(log_data, step=self.global_step)
|
||||
self.status = EvaluationStepStatus.LOGGED
|
||||
logger.info(f"Logging eval result {wandb_data} to step {self.global_step}")
|
||||
logger.info(f"Logging eval result {log_data} to step {self.global_step}")
|
||||
|
||||
return True
|
||||
|
||||
def check(self):
|
||||
|
@ -154,13 +157,15 @@ class AutomaticEvaluator:
|
|||
self,
|
||||
config: config_pkg.AutomaticEvaluator,
|
||||
wandb_config: config_pkg.WandBConfig,
|
||||
swanlab_config: config_pkg.SwanlabConfig,
|
||||
):
|
||||
self.__eval_steps: Dict[int, EvaluationStep] = {}
|
||||
self.__max_concurrent_jobs = config.max_concurrent_jobs
|
||||
self.__wandb_config = wandb_config
|
||||
self.__swanlab_config = swanlab_config
|
||||
self.__config = config
|
||||
self.__wandb_initialized = False
|
||||
|
||||
self.__swanlab_initialized = False
|
||||
# Check evaluated checkpoints by logs in recover
|
||||
# NOTE: All previous evaluation steps with output will be marked
|
||||
# as logged, even if it is not really logged in wandb.
|
||||
|
@ -228,6 +233,40 @@ class AutomaticEvaluator:
|
|||
settings=wandb.Settings(start_method="fork"),
|
||||
)
|
||||
|
||||
def __lazy_swanlab_init(self):
|
||||
if self.__swanlab_config.api_key:
|
||||
swanlab.login(self.__swanlab_config.api_key)
|
||||
if self.swanlab_config.config is None:
|
||||
import yaml
|
||||
|
||||
with open(
|
||||
os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"config.yaml",
|
||||
),
|
||||
"r",
|
||||
) as f:
|
||||
__config = yaml.safe_load(f)
|
||||
else:
|
||||
__config = self.swanlab_config.config
|
||||
__config["FRAMEWORK"] = "AReaL"
|
||||
swanlab.init(
|
||||
project=self.__swanlab_config.project or constants.experiment_name(),
|
||||
experiment_name=self.__swanlab_config.name
|
||||
or f"{constants.trial_name()}_eval",
|
||||
config=__config,
|
||||
logdir=self.__swanlab_config.logdir
|
||||
or os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"swanlab",
|
||||
),
|
||||
mode=self.__swanlab_config.mode,
|
||||
)
|
||||
|
||||
def step(self):
|
||||
# Check whether a new evaluation step should be created
|
||||
ckpt_parent = os.path.join(
|
||||
|
@ -292,6 +331,9 @@ class AutomaticEvaluator:
|
|||
if not self.__wandb_initialized:
|
||||
self.__lazy_wandb_init()
|
||||
self.__wandb_initialized = True
|
||||
if not self.__swanlab_initialized:
|
||||
self.__lazy_swanlab_init()
|
||||
self.__swanlab_initialized = True
|
||||
self.__eval_steps[log_step].log(self.__config)
|
||||
|
||||
@property
|
||||
|
|
|
@ -12,6 +12,7 @@ from typing import Dict
|
|||
import colorama
|
||||
import networkx as nx
|
||||
import numpy as np
|
||||
import swanlab
|
||||
import wandb
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
@ -312,6 +313,40 @@ class MasterWorker(worker_base.AsyncWorker):
|
|||
resume="allow",
|
||||
settings=wandb.Settings(start_method="fork"),
|
||||
)
|
||||
|
||||
# swanlab init, connect to remote or local swanlab host
|
||||
if self.swanlab_config.mode != "disabled" and self.swanlab_config.api_key:
|
||||
swanlab.login(self.swanlab_config.api_key)
|
||||
if self.swanlab_config.config is None:
|
||||
import yaml
|
||||
|
||||
with open(
|
||||
os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"config.yaml",
|
||||
),
|
||||
"r",
|
||||
) as f:
|
||||
__config = yaml.safe_load(f)
|
||||
else:
|
||||
__config = self.swanlab_config.config
|
||||
__config["FRAMEWORK"] = "AReaL"
|
||||
swanlab.init(
|
||||
project=self.swanlab_config.project or constants.experiment_name(),
|
||||
experiment_name=self.swanlab_config.name
|
||||
or f"{constants.trial_name()}_train",
|
||||
config=__config,
|
||||
logdir=self.swanlab_config.logdir
|
||||
or os.path.join(
|
||||
constants.LOG_ROOT,
|
||||
constants.experiment_name(),
|
||||
constants.trial_name(),
|
||||
"swanlab",
|
||||
),
|
||||
mode=self.swanlab_config.mode,
|
||||
)
|
||||
# tensorboard logging
|
||||
self.__summary_writer = None
|
||||
if self.tensorboard_config.path is not None:
|
||||
|
@ -487,7 +522,7 @@ class MasterWorker(worker_base.AsyncWorker):
|
|||
s += f"(global step {global_step}) finishes. "
|
||||
s += f"#End to end# execution time: *{e2e_time:.3f}*s. "
|
||||
s += f"Total time consumption: {time_since_configure:.3f}s. "
|
||||
logging.log_wandb_tensorboard({"timeperf/e2e": e2e_time})
|
||||
logging.log_swanlab_wandb_tensorboard({"timeperf/e2e": e2e_time})
|
||||
if len(self.e2e_time_history) > 2:
|
||||
remaining_steps = self._steps_per_epoch - epoch_step
|
||||
remaining_epochs = self.__total_train_epochs - epoch
|
||||
|
@ -540,6 +575,7 @@ class MasterWorker(worker_base.AsyncWorker):
|
|||
)
|
||||
|
||||
wandb.finish()
|
||||
swanlab.finish()
|
||||
if self.__summary_writer is not None:
|
||||
self.__summary_writer.close()
|
||||
gc.collect()
|
||||
|
|
|
@ -10,6 +10,7 @@ import uuid
|
|||
from collections import defaultdict
|
||||
from typing import Dict, Hashable, List, Set, Tuple
|
||||
|
||||
import swanlab
|
||||
import wandb
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
@ -442,7 +443,7 @@ class ModelFunctionCall:
|
|||
logger.info(
|
||||
f"RPC name {rpc.name} returns\n{data_api.tabulate_stats(res)}"
|
||||
)
|
||||
logging.log_wandb_tensorboard(
|
||||
logging.log_swanlab_wandb_tensorboard(
|
||||
res,
|
||||
step=ctrl.step_info.global_step,
|
||||
summary_writer=self.summary_writer,
|
||||
|
@ -453,7 +454,7 @@ class ModelFunctionCall:
|
|||
f"RPC name {rpc.name} returns ({j + 1}/{len(res)})\n{data_api.tabulate_stats(r)}"
|
||||
)
|
||||
offset = len(res) * ctrl.step_info.global_step
|
||||
logging.log_wandb_tensorboard(
|
||||
logging.log_swanlab_wandb_tensorboard(
|
||||
r,
|
||||
step=offset + j,
|
||||
summary_writer=self.summary_writer,
|
||||
|
@ -465,11 +466,10 @@ class ModelFunctionCall:
|
|||
for time_record in time_records:
|
||||
stats_tracker.scalar(**time_record)
|
||||
time_stats = stats_tracker.export()
|
||||
logging.log_wandb_tensorboard(
|
||||
logging.log_swanlab_wandb_tensorboard(
|
||||
time_stats,
|
||||
summary_writer=self.summary_writer,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Model rpc {rpc.name} finished. "
|
||||
f"Request-reply time {time.perf_counter() - tik:.4f}s. "
|
||||
|
|
|
@ -580,7 +580,9 @@ class Worker:
|
|||
)
|
||||
expr_config.lazy_init()
|
||||
self.wandb_config = expr_config.wandb
|
||||
self.swanlab_config = expr_config.swanlab
|
||||
os.environ["WANDB_MODE"] = self.wandb_config.mode
|
||||
os.environ["SWANLAB_MODE"] = self.swanlab_config.mode
|
||||
self.tensorboard_config = expr_config.tensorboard
|
||||
config = expr_config.resolve_worker_config(
|
||||
self.__worker_type, self.__worker_index
|
||||
|
|
|
@ -68,4 +68,5 @@ python_dateutil
|
|||
word2number
|
||||
Pebble
|
||||
timeout-decorator
|
||||
prettytable
|
||||
prettytable
|
||||
swanlab[dashboard]
|
|
@ -14,6 +14,13 @@ wandb:
|
|||
notes: null
|
||||
tags: null
|
||||
config: null
|
||||
swanlab:
|
||||
mode: disabled
|
||||
api_key: null
|
||||
project: null
|
||||
name: null
|
||||
config: null
|
||||
logdir: null
|
||||
tensorboard:
|
||||
path: null
|
||||
recover_mode: auto
|
||||
|
|
|
@ -14,6 +14,13 @@ wandb:
|
|||
notes: null
|
||||
tags: null
|
||||
config: null
|
||||
swanlab:
|
||||
mode: disabled
|
||||
api_key: null
|
||||
project: null
|
||||
name: null
|
||||
config: null
|
||||
logdir: null
|
||||
tensorboard:
|
||||
path: null
|
||||
recover_mode: auto
|
||||
|
|
|
@ -14,6 +14,13 @@ wandb:
|
|||
notes: null
|
||||
tags: null
|
||||
config: null
|
||||
swanlab:
|
||||
mode: disabled
|
||||
api_key: null
|
||||
project: null
|
||||
name: null
|
||||
config: null
|
||||
logdir: null
|
||||
tensorboard:
|
||||
path: null
|
||||
recover_mode: auto
|
||||
|
|
|
@ -90,6 +90,7 @@ class RayWorker:
|
|||
worker_info.experiment_name, worker_info.trial_name
|
||||
)
|
||||
self.worker.wandb_config = expr_config.wandb
|
||||
self.worker.swanlab_config = expr_config.swanlab
|
||||
self.worker.tensorboard_config = expr_config.tensorboard
|
||||
self.logger = logging.getLogger(f"{self.worker_type} {idx}", "benchmark")
|
||||
self.logger.info(f"Configuring {self.worker_type}...")
|
||||
|
@ -125,6 +126,7 @@ def _run_experiment(exp_cfg, expr_name, trial_name):
|
|||
# Initialize ray in the Ray cluster
|
||||
env_vars = constants.get_env_vars(
|
||||
WANDB_MODE=exp_cfg.wandb.mode,
|
||||
SWANLAB_MODE=exp_cfg.swanlab.mode,
|
||||
REAL_MODE="ray",
|
||||
REAL_RECOVER_RUN="0",
|
||||
REAL_SAVE_RECOVER_STATES="1",
|
||||
|
|
Loading…
Reference in New Issue