# Source: AReaL/training/configs/async-ppo.yaml (205 lines, 4.3 KiB, YAML)

# Basic experiment info: run identifiers, seed, and launch mode.
experiment_name: async-ppo
trial_name: my-trial
seed: 1
# NOTE(review): "ray" presumably selects a Ray-based launcher (this file also
# sets ray_temp_path) — confirm against the launcher's accepted modes.
mode: ray
metric_discovery_port: 17997
# wandb logging settings; mode is "disabled" and every other field is unset.
# The child keys are nested under `wandb` so that `mode` here does not collide
# with the top-level launch `mode`.
wandb:
  mode: disabled
  entity: null
  project: null
  name: null
  job_type: null
  group: null
  notes: null
  tags: null
  config: null
# swanlab logging settings; mode is "disabled" and every other field is unset.
# The child keys are nested under `swanlab` so they do not duplicate the
# identically named keys of other logger sections.
swanlab:
  mode: disabled
  api_key: null
  project: null
  name: null
  config: null
  logdir: null
# tensorboard settings; path is unset.
# NOTE(review): only `path` is assumed to belong to this section — confirm
# against the consuming config schema.
tensorboard:
  path: null
# Recovery settings.
# NOTE(review): the unit of recover_after (seconds vs. steps) is not visible
# in this file — confirm before changing it.
recover_mode: auto
recover_retries: 10
recover_after: 10
# Experiment control: total epochs plus save / checkpoint / eval / benchmark
# frequencies. A null frequency leaves that trigger unset.
# NOTE(review): the section is assumed to end at benchmark_n_seqs — confirm
# against the consuming config schema.
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  save_freq_steps: null
  save_freq_secs: null
  ckpt_freq_epochs: null
  ckpt_freq_steps: null
  ckpt_freq_secs: 600
  eval_freq_epochs: null
  eval_freq_steps: null
  eval_freq_secs: null
  benchmark_steps: null
  benchmark_n_seqs: null
# Cache-clearing knobs.
# NOTE(review): presumably these control how aggressively the torch cache is
# emptied, and how often — confirm against the trainer implementation.
torch_cache_mysophobia: true
cache_clear_freq: 1
# Asynchronous RL options
# NOTE(review): null values here presumably fall back to framework-derived
# defaults, and flush_request_timeout is presumably in seconds — confirm.
max_head_offpolicyness: 4
n_rollout_workers: null
max_concurrent_rollouts: null
flush_request_timeout: 300
# Asynchronous worker resources (CPU count and memory per worker kind)
# NOTE(review): mem_* values are presumably in MB — confirm.
cpus_per_generation_server: 4
mem_per_generation_server: 61440
cpus_per_gserver_manager: 4
mem_per_gserver_manager: 10240
cpus_per_rollout_worker: 4
mem_per_rollout_worker: 20480
# Allocation and parallelism
# NOTE(review): allocation_mode presumably joins a generation layout
# (sglang.d4p1m1) and a training layout (d2p2m1) with '+'; if d/p/m are
# parallel degrees, 4*1*1 + 2*2*1 = 8 matches n_nodes * n_gpus_per_node.
# Confirm before changing either side.
allocation_mode: sglang.d4p1m1+d2p2m1
n_nodes: 1
n_gpus_per_node: 8
# Cluster configuration
ray_temp_path: /tmp/ray
# n_nodes / n_gpus_per_node are nested under `cluster` so they do not collide
# with the top-level keys of the same names (n_nodes: 1 for this experiment
# vs. 32 here).
# NOTE(review): presumably these describe the whole cluster rather than this
# experiment's allocation — confirm against the consuming config schema.
cluster:
  fileroot: /storage/ray/experiments
  n_nodes: 32
  n_gpus_per_node: 8
# Model
# Actor model: model class and checkpoint path, precision flags, optimizer,
# megatron DDP options, and sglang generation-server options.
# NOTE(review): the nesting below (type / optimizer / megatron.ddp / sglang as
# children of `actor`) is reconstructed from key names — confirm against the
# consuming config schema.
actor:
  type:
    _class: qwen3
  path: /storage/openpsi/models/Qwen3-1.7B/
  init_from_scratch: false
  gradient_checkpointing: true
  bf16: false
  optimizer:
    type: adam
    lr: 2.0e-05
    weight_decay: 0.05
    beta1: 0.9
    beta2: 0.95
    eps: 1.0e-05
    min_lr_ratio: 0.0
    lr_scheduler_type: constant
    warmup_steps_proportion: 0.001
    # 4294967296.0 == 2**32
    initial_loss_scale: 4294967296.0
    min_loss_scale: 1.0
    loss_scale_window: 5.0
    hysteresis: 2
    gradient_clipping: 1.0
  megatron:
    ddp:
      grad_reduce_in_fp32: true
      overlap_grad_reduce: true
      use_distributed_optimizer: true
  sglang:
    disable_cuda_graph: false
    disable_radix_cache: false
    disable_cuda_graph_padding: false
    enable_nccl_nvls: false
    disable_outlines_disk_cache: false
    disable_custom_all_reduce: false
    disable_overlap_schedule: false
    enable_mixed_chunk: false
    enable_torch_compile: false
    torch_compile_max_bs: 32
    cuda_graph_max_bs: null
    cuda_graph_bs: null
    torchao_config: ''
    enable_nan_detection: false
    enable_p2p_check: false
    triton_attention_reduce_in_fp32: false
    triton_attention_num_kv_splits: 8
    num_continuous_decode_steps: 1
    enable_memory_saver: false
    allow_auto_truncate: false
    attention_backend: flashinfer
    sampling_backend: null
    context_length: 32768
    mem_fraction_static: 0.8
    max_running_requests: null
    chunked_prefill_size: -1
    max_prefill_tokens: 32768
    schedule_policy: lpm
    schedule_conservativeness: 1.0
    cpu_offload_gb: 0
    # float16 here is consistent with bf16: false above.
    dtype: float16
    kv_cache_dtype: auto
    log_level: warning
    log_level_http: warning
    log_requests: false
    log_requests_level: 0
    show_time_cost: false
    enable_metrics: true
    decode_log_interval: 1
# Reference model: same model class and checkpoint path as the actor.
# The keys are nested under `ref` so they do not duplicate the actor's
# type / path / init_from_scratch / bf16 keys.
ref:
  type:
    _class: qwen3
  path: /storage/openpsi/models/Qwen3-1.7B/
  init_from_scratch: false
  bf16: false
# actor_train: token cap per micro-batch.
# NOTE(review): mb_spec presumably means "micro-batch spec" — confirm.
actor_train:
  mb_spec:
    max_tokens_per_mb: 30720
# ref_inf: token cap per micro-batch.
# NOTE(review): mb_spec presumably means "micro-batch spec" — confirm.
ref_inf:
  mb_spec:
    max_tokens_per_mb: 30720
# actor_inf: token cap per micro-batch.
# NOTE(review): mb_spec presumably means "micro-batch spec" — confirm.
actor_inf:
  mb_spec:
    max_tokens_per_mb: 30720
# Dataset
shuffle_dataset: true
# Dataset file, prompt-length limit, and training batch size.
# `path` is nested under `dataset` so it does not collide with the model
# checkpoint `path` keys.
# NOTE(review): train_bs_n_seqs is presumably counted in sequences — confirm.
dataset:
  path: /storage/datasets/boba_106k_0319.jsonl
  max_prompt_len: 1024
  train_bs_n_seqs: 512
# Algorithm
# NOTE(review): success_rate_lb / success_rate_ub of 0.0 / 1.0 span the full
# range, so presumably nothing is filtered by success rate — confirm.
group_size: 16
mask_too_long: false
group_adv_norm: false
rw_type: sparse
success_rate_ub: 1.0
success_rate_lb: 0.0
# PPO hyperparameters. `gen` holds the sampling settings for generation; the
# remaining keys configure the PPO update.
# NOTE(review): `gen` is assumed to end at `temperature`, with every later key
# a direct child of `ppo` — confirm against the consuming config schema.
ppo:
  gen:
    n: 1
    max_new_tokens: 27648
    min_new_tokens: 0
    greedy: false
    top_p: 1.0
    top_k: 1000000
    temperature: 1.0
  ppo_n_minibatches: 4
  eps_clip: 0.2
  c_clip: null
  value_eps_clip: 0.2
  early_stop_imp_ratio: 5.0
  actor_sample_reuse: 1
  critic_sample_reuse: 1
  max_reward_clip: 20.0
  reward_output_scaling: 5.0
  reward_output_bias: 0.0
  fuse_rew_ref: true
  discount: 1.0
  gae_lambda: 1.0
  adv_norm: true
  kl_ctl: 0.0
  use_adaptive_kl_ctl: false
  disable_value: true
  recompute_logprob: true
  use_decoupled_loss: true
  behav_imp_weight_cap: null
# Worker resources (CPU count and memory for master and model workers)
# NOTE(review): mem_* values are presumably in MB — confirm.
cpus_per_master_worker: 4
mem_per_master_worker: 20000
cpus_per_model_worker: 4
mem_per_model_worker: 90000