# AReaL/examples/configs/v0.2-qwen2-math/7B-distill/ppo-7B-distill-gpus-32.yaml

experiment_name: ppo-7B-distill-gpus-32
trial_name: 512x16
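# "512x16" mirrors dataset.train_bs_n_seqs (512) x ppo.group_size (16) below.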
mode: ray
wandb:
  mode: disabled
metric_discovery_port: 17997
recover_mode: auto
recover_retries: 10
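# Presumably lets a crashed run resume automatically from the latest recovery
# checkpoint, retrying up to 10 times.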
allocation_mode: 'sglang.d16p1m1+d8p2m1'
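# A sketch of how the allocation string decomposes, assuming AReaL's d/p/m
# notation means data-/pipeline-/tensor(model)-parallel degrees: 'sglang.d16p1m1'
# gives the SGLang rollout workers 16 GPUs, and 'd8p2m1' gives training
# 8 x 2 x 1 = 16 GPUs, together matching the 4 nodes x 8 GPUs declared below.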
n_nodes: 4
n_gpus_per_node: 8
cache_clear_freq: 1
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  ckpt_freq_secs: 600
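  # save_freq_epochs controls persistent model saves; ckpt_freq_secs presumably
  # writes lighter-weight recovery checkpoints every 600 s.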
torch_cache_mysophobia: true
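# Assumption: cache_clear_freq and torch_cache_mysophobia both trigger aggressive
# CUDA-allocator cache clearing to limit fragmentation at these sequence lengths.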
actor:
  type:
    _class: qwen2
  path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
  optimizer:
    lr: 2e-05
    lr_scheduler_type: constant
    eps: 1e-5
    warmup_steps_proportion: 0.001
    hysteresis: 2
  sglang:
    mem_fraction_static: 0.8
    triton_attention_num_kv_splits: 16
    enable_metrics: true
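  # mem_fraction_static is SGLang's share of GPU memory reserved for weights plus
  # the KV-cache pool; 0.8 leaves headroom for per-request activations.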
ref:
  type:
    _class: qwen2
  path: '/storage/models/DeepSeek-R1-Distill-Qwen-7B'
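# The reference model reuses the actor's base checkpoint; since kl_ctl is 0.0
# below, it presumably serves only logging / diagnostics here.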
actor_train:
  mb_spec:
    max_tokens_per_mb: 30720
actor_inf:
  mb_spec:
    max_tokens_per_mb: 30720
ref_inf:
  mb_spec:
    max_tokens_per_mb: 30720
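# 30720 tokens per microbatch exceeds the worst case of max_prompt_len +
# max_new_tokens = 1024 + 27648 = 28672, so even the longest sample fits in a
# single microbatch.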
dataset:
  path: '/storage/datasets/boba_106k_0319.jsonl'
  max_prompt_len: 1024
  train_bs_n_seqs: 512
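# Effective rollout size per step: 512 prompts x 16 responses (group_size) =
# 8192 sequences.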
ppo:
  gen:
    max_new_tokens: 27648
    min_new_tokens: 0
    top_p: 1.0
    top_k: 1000000
    temperature: 1.0
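  # top_p 1.0 and temperature 1.0 with an effectively unbounded top_k sample
  # from the unmodified model distribution.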
  ppo_n_minibatches: 4
  kl_ctl: 0.0
  discount: 1.0
  value_eps_clip: 0.2
  disable_value: true
  reward_output_scaling: 5
  reward_output_bias: 0.0
  adv_norm: true
  group_size: 16
  group_adv_norm: false
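  # With the critic disabled, no KL penalty, and 16 samples per prompt under
  # advantage normalization, this setup is effectively GRPO-style rather than
  # classic PPO with a learned value model.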
# Cluster configuration
ray_temp_path: /tmp/ray
cluster:
  fileroot: /tmp/ray/experiments
  n_nodes: 32
  n_gpus_per_node: 8
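  # Assumption: these describe the Ray cluster's total capacity (32 nodes), as
  # opposed to the top-level n_nodes: 4 that this experiment actually schedules.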