# AReaL/examples/configs/v0.2-qwen2-math/7B-zero/ppo-7B-zero-gpus-128.yaml
#
# 73 lines
# 1.4 KiB
# YAML

# Experiment identity and launcher settings.
experiment_name: ppo-7B-zero-gpus-128
trial_name: 512x64
mode: ray

# `mode` here belongs to `wandb`; it must be nested so it does not collide
# with (and override) the launcher `mode` above.
wandb:
  mode: disabled

metric_discovery_port: 17997
recover_mode: auto
recover_retries: 10

# NOTE(review): presumably a 64-GPU SGLang generation layout (d64p1m1) plus a
# 64-GPU training layout (d32 x p2 x m1), 128 GPUs in total; confirm against
# the AReaL allocation-mode documentation.
allocation_mode: 'sglang.d64p1m1+d32p2m1'

# 16 nodes x 8 GPUs per node = 128 GPUs, matching the experiment name.
n_nodes: 16
n_gpus_per_node: 8
cache_clear_freq: 1
# Training-length and checkpoint cadence; the three keys below are children
# of `exp_ctrl` (a bare `exp_ctrl:` would otherwise parse as null).
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  ckpt_freq_secs: 600

# NOTE(review): kept at top level, assuming it is an experiment-wide option
# rather than part of `exp_ctrl`; confirm against the consumer's schema.
torch_cache_mysophobia: true
# Actor (policy) model: architecture class, checkpoint path, optimizer and
# SGLang generation-server options.
actor:
  type:
    _class: qwen2
  path: '/storage/models/Qwen2.5-7B'
  optimizer:
    # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
    # resolve dot-less exponents such as `1e-05` as strings, not floats.
    lr: 1.0e-05
    lr_scheduler_type: constant
    eps: 1.0e-05
    warmup_steps_proportion: 0.001
    hysteresis: 2
  sglang:
    mem_fraction_static: 0.8
    triton_attention_num_kv_splits: 16
    # Canonical lowercase boolean (was `True`).
    enable_metrics: true
# Reference model: same architecture class and checkpoint path as the actor.
ref:
  type:
    _class: qwen2
  path: '/storage/models/Qwen2.5-7B'
# Micro-batch token budget for each model function call. Each `mb_spec` is
# nested under its own parent so the three entries do not overwrite each other.
actor_train:
  mb_spec:
    max_tokens_per_mb: 19456

actor_inf:
  mb_spec:
    max_tokens_per_mb: 19456

ref_inf:
  mb_spec:
    max_tokens_per_mb: 19456
# Prompt dataset and batching. `path` is nested here so it stays distinct
# from the model checkpoint paths.
dataset:
  path: '/storage/datasets/orz-zero_56k_0319.jsonl'
  max_prompt_len: 2048
  # 512 sequences per training batch (the "512" in trial_name `512x64`).
  train_bs_n_seqs: 512
# PPO hyperparameters.
ppo:
  # Sampling options used when generating responses.
  gen:
    max_new_tokens: 16384
    min_new_tokens: 0
    top_p: 1.0
    # NOTE(review): a very large top_k presumably disables top-k filtering.
    top_k: 1000000
    temperature: 1.0
  ppo_n_minibatches: 4
  kl_ctl: 0.0
  discount: 1.0
  value_eps_clip: 0.2
  disable_value: true
  reward_output_scaling: 0.5
  reward_output_bias: -1.0
  adv_norm: true

# NOTE(review): nesting was reconstructed from a flattened source; these two
# keys are assumed to be experiment-level options rather than children of
# `ppo`. Confirm against the consumer's schema.
# 64 responses per prompt (the "64" in trial_name `512x64`).
group_size: 64
group_adv_norm: false
# Cluster configuration
ray_temp_path: /tmp/ray
cluster:
  fileroot: /tmp/ray/experiments
  # These describe the cluster and are nested under `cluster` so they do not
  # override the experiment-level `n_nodes` / `n_gpus_per_node`.
  # NOTE(review): 32 nodes here vs. 16 nodes requested by the experiment;
  # presumably the cluster is larger than this job. Confirm this is intended.
  n_nodes: 32
  n_gpus_per_node: 8