# Mirror of https://github.com/inclusionAI/AReaL
# File info: YAML, 97 lines, 1.9 KiB
# Basic experiment info
# NOTE(review): the extracted copy of this file had lost its indentation and
# had stray "|" lines after every entry. The nesting below is reconstructed
# from key grouping -- confirm against the upstream file.
experiment_name: sft
trial_name: my-trial
seed: 1
mode: ray
metric_discovery_port: 17997
# Weights & Biases settings. These keys are nested here because a flat layout
# would give the document two top-level `mode` keys (`ray` and `disabled`).
wandb:
  mode: disabled
  entity: null
  project: null
  name: null
  job_type: null
  group: null
  notes: null
  tags: null
  config: null
# NOTE(review): `path` is assumed to belong to `tensorboard` -- confirm.
tensorboard:
  path: null
# NOTE(review): the units of `recover_after` are not visible here -- confirm.
recover_mode: auto
recover_retries: 10
recover_after: 10

# Experiment control: training length plus save / checkpoint / eval cadence.
# Only `save_freq_epochs` and `ckpt_freq_secs` are set; every other frequency
# is null.
# NOTE(review): nesting reconstructed after indentation was lost in
# extraction -- confirm against the upstream file.
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  save_freq_steps: null
  save_freq_secs: null
  ckpt_freq_epochs: null
  ckpt_freq_steps: null
  ckpt_freq_secs: 600
  eval_freq_epochs: null
  eval_freq_steps: null
  eval_freq_secs: null
  benchmark_steps: null
  benchmark_n_seqs: null
# NOTE(review): the two cache keys are assumed to be top-level options rather
# than members of `exp_ctrl` -- TODO confirm.
torch_cache_mysophobia: true
cache_clear_freq: 1

# Allocation and parallelism
# NOTE(review): `d4p2m1` presumably encodes parallel degrees whose product
# (4 * 2 * 1 = 8) matches n_nodes * n_gpus_per_node -- confirm the exact
# meaning of each letter against the project documentation.
allocation_mode: d4p2m1
n_nodes: 1
n_gpus_per_node: 8

# Cluster configuration
ray_temp_path: /tmp/ray
# `n_nodes` / `n_gpus_per_node` are nested under `cluster` so they do not
# duplicate the top-level keys of the same name used for this experiment.
# NOTE(review): nesting reconstructed after indentation was lost in
# extraction -- confirm against the upstream file.
cluster:
  fileroot: /storage/ray/experiments
  n_nodes: 32
  n_gpus_per_node: 8

# Model
# NOTE(review): nesting (type / optimizer / megatron.ddp) is reconstructed
# from key grouping after indentation was lost in extraction -- confirm
# against the upstream file.
model:
  type:
    _class: qwen2
  path: /storage/models/DeepSeek-R1-Distill-Qwen-7B
  init_from_scratch: false
  gradient_checkpointing: true
  bf16: false
  optimizer:
    type: adam
    lr: 1.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    eps: 1.0e-05
    min_lr_ratio: 0.0
    lr_scheduler_type: constant
    warmup_steps_proportion: 0.03
    offload: false
    # NOTE(review): the loss-scale settings below are presumably only
    # relevant because `bf16` is false -- confirm.
    initial_loss_scale: 262144.0
    min_loss_scale: 1.0
    loss_scale_window: 10.0
    hysteresis: 2
    gradient_clipping: 1.0
  megatron:
    ddp:
      grad_reduce_in_fp32: true
      overlap_grad_reduce: true
      use_distributed_optimizer: true
# Micro-batch specification.
# NOTE(review): `allocation` is assumed to be a top-level key (not a member
# of `model`); nesting reconstructed after indentation was lost -- confirm.
allocation:
  mb_spec:
    n_mbs: 1
    max_tokens_per_mb: 32768

# Dataset
# Training and validation both point at the same JSONL file.
# NOTE(review): nesting reconstructed after indentation was lost in
# extraction -- confirm against the upstream file.
dataset:
  train_path: /storage/datasets/boba-sft_200_0319.jsonl
  valid_path: /storage/datasets/boba-sft_200_0319.jsonl
  max_seqlen: 32768
  train_bs_n_seqs: 16
  valid_bs_n_seqs: 16

# Worker resources
# NOTE(review): the units of the `mem_*` values are not visible here
# (presumably MB) -- confirm.
cpus_per_master_worker: 4
mem_per_master_worker: 20000
cpus_per_model_worker: 4
mem_per_model_worker: 90000