# AReaL/training/configs/sft.yaml
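# Hypothetical launch command, assuming the repo's standard SFT entry point
# (the exact invocation is not recorded here; check the AReaL README):
#   python3 training/main_sft.py --config training/configs/sft.yaml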

# Basic experiment info
experiment_name: sft
trial_name: my-trial
seed: 1
mode: ray
metric_discovery_port: 17997
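# Experiment tracking is off by default; set wandb.mode to "online" and fill in
# entity/project to log this run to Weights & Biases.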
wandb:
  mode: disabled
  entity: null
  project: null
  name: null
  job_type: null
  group: null
  notes: null
  tags: null
  config: null
tensorboard:
  path: null
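# Fault tolerance: with recover_mode auto, a restarted run should resume from
# the latest recoverable checkpoint, retrying up to recover_retries times.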
recover_mode: auto
recover_retries: 10
recover_after: 10
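# In exp_ctrl below, save_* controls persistent model saves (here once per
# epoch), while ckpt_* writes recoverable checkpoints for restarts (here every
# 600 s).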
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  save_freq_steps: null
  save_freq_secs: null
  ckpt_freq_epochs: null
  ckpt_freq_steps: null
  ckpt_freq_secs: 600
  eval_freq_epochs: null
  eval_freq_steps: null
  eval_freq_secs: null
  benchmark_steps: null
  benchmark_n_seqs: null
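# torch_cache_mysophobia clears torch's CUDA allocator cache between steps;
# cache_clear_freq: 1 does this every step, trading a little speed for less
# memory fragmentation.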
torch_cache_mysophobia: true
cache_clear_freq: 1
# Allocation and parallelism
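# d4p2m1 = 4-way data parallel x 2-way pipeline parallel x 1-way tensor (model)
# parallel; the product must equal the GPUs used (1 node x 8 GPUs = 8).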
allocation_mode: d4p2m1
n_nodes: 1
n_gpus_per_node: 8
# Cluster configuration
ray_temp_path: /tmp/ray
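# cluster describes the available Ray cluster; the experiment itself only
# requests the n_nodes/n_gpus_per_node given above.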
cluster:
  fileroot: /storage/ray/experiments
  n_nodes: 32
  n_gpus_per_node: 8
# Model
model:
  type:
    _class: qwen2
  path: /storage/models/DeepSeek-R1-Distill-Qwen-7B
  init_from_scratch: false
  gradient_checkpointing: true
  bf16: false
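  # bf16: false presumably leaves training in fp16 mixed precision, which is
  # why the dynamic loss-scaling fields under optimizer (initial_loss_scale,
  # loss_scale_window, hysteresis) are set; bf16 would ignore them.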
  optimizer:
    type: adam
    lr: 1.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    eps: 1.0e-05
    min_lr_ratio: 0.0
    lr_scheduler_type: constant
    warmup_steps_proportion: 0.03
    offload: false
    initial_loss_scale: 262144.0
    min_loss_scale: 1.0
    loss_scale_window: 10.0
    hysteresis: 2
    gradient_clipping: 1.0
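  # Megatron DDP: gradients are all-reduced in fp32, the reduce is overlapped
  # with the backward pass, and optimizer states are sharded across
  # data-parallel ranks (ZeRO-1-style distributed optimizer).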
  megatron:
    ddp:
      grad_reduce_in_fp32: true
      overlap_grad_reduce: true
      use_distributed_optimizer: true
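# mb_spec controls how each training batch is split into micro-batches: n_mbs
# sets at least how many splits to make, and max_tokens_per_mb caps the tokens
# per micro-batch (matching max_seqlen below).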
allocation:
  mb_spec:
    n_mbs: 1
    max_tokens_per_mb: 32768
# Dataset
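# Note: train_path and valid_path point at the same JSONL here; substitute a
# held-out file for a meaningful validation split. Batch sizes (*_bs_n_seqs)
# are counted in sequences.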
dataset:
  train_path: /storage/datasets/boba-sft_200_0319.jsonl
  valid_path: /storage/datasets/boba-sft_200_0319.jsonl
  max_seqlen: 32768
  train_bs_n_seqs: 16
  valid_bs_n_seqs: 16
# Worker resources
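# Memory values appear to be in MB (~20 GB for the master worker, ~90 GB per
# model worker).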
cpus_per_master_worker: 4
mem_per_master_worker: 20000
cpus_per_model_worker: 4
mem_per_model_worker: 90000