AReaL/training/configs/async-ppo/async-ppo-1.7b-gpu32.yaml

# Async RL staleness bound: rollout data may lag at most this many policy versions behind the trainer.
max_head_offpolicyness: 4
experiment_name: async-ppo-1.7b-gpu32
trial_name: my-trial
mode: ray
cluster:
  fileroot: /storage/ray/experiments
wandb:
  mode: disabled
recover_mode: auto
recover_retries: 10
# 24 GPUs for SGLang generation (d24 x p1 x m1) + 8 GPUs for training (d4 x p2 x m1) = 32 GPUs.
allocation_mode: sglang.d24p1m1+d4p2m1
n_nodes: 4
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  ckpt_freq_secs: 600
torch_cache_mysophobia: true
dataset:
  path: /storage/datasets/boba_106k_0319.jsonl
  max_prompt_len: 1024
  train_bs_n_seqs: 512
group_size: 16
group_adv_norm: false
actor:
  type:
    _class: qwen3
  path: /storage/openpsi/models/Qwen3-1.7B/
  optimizer:
    lr: 2e-05
    lr_scheduler_type: constant
    eps: 1e-5
    warmup_steps_proportion: 0.001
    hysteresis: 2
  sglang:
    mem_fraction_static: 0.8
actor_train:
  mb_spec:
    max_tokens_per_mb: 30720
actor_gen:
  mb_spec:
    max_tokens_per_mb: 30720
actor_inf:
  mb_spec:
    max_tokens_per_mb: 30720
ppo:
  gen:
    max_new_tokens: 27648
    min_new_tokens: 0
    top_p: 1.0
    top_k: 1000000
    temperature: 1.0
  ppo_n_minibatches: 4
  kl_ctl: 0.0
  discount: 1.0
  value_eps_clip: 0.2
  disable_value: true
  reward_output_scaling: 5
  reward_output_bias: 0.0
  adv_norm: true
  value_norm: true
  recompute_logprob: true
  use_decoupled_loss: true
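
The GPU budget in this config has to add up: the SGLang spec d24p1m1 accounts for 24 rollout GPUs and the training spec d4p2m1 for 8 training GPUs, matching n_nodes x n_gpus_per_node = 32. Token budgets are consistent as well: dataset.max_prompt_len (1024) plus ppo.gen.max_new_tokens (27648) is 28672 tokens, which stays under the 30720-token microbatch cap in max_tokens_per_mb. The standalone Python sketch below is not part of AReaL; it only assumes the d/p/m product convention described above, a local copy of the file under the hypothetical name async-ppo-1.7b-gpu32.yaml, and PyYAML being installed. It re-checks the GPU arithmetic, which is easy to get wrong when scaling the config to a different cluster size.

import re
import yaml  # PyYAML; an assumption of this sketch, not a dependency stated by the config

# Hypothetical local path to the config shown above.
with open("async-ppo-1.7b-gpu32.yaml") as f:
    cfg = yaml.safe_load(f)

def spec_gpus(spec: str) -> int:
    """GPUs used by a 'd{data}p{pipeline}m{tensor}' spec, e.g. d24p1m1 -> 24."""
    d, p, m = map(int, re.fullmatch(r"d(\d+)p(\d+)m(\d+)", spec).groups())
    return d * p * m

gen_spec, train_spec = cfg["allocation_mode"].split("+")
gen_gpus = spec_gpus(gen_spec.removeprefix("sglang."))   # rollout side: 24
train_gpus = spec_gpus(train_spec)                       # training side: 4 * 2 * 1 = 8
cluster_gpus = cfg["n_nodes"] * cfg["n_gpus_per_node"]   # 4 * 8 = 32

print(f"rollout={gen_gpus} train={train_gpus} cluster={cluster_gpus}")
assert gen_gpus + train_gpus == cluster_gpus, "allocation_mode does not fill the cluster"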