AReaL/arealite/config/async_grpo.yaml

# Basic experiment info
experiment_name: gsm8k-test
trial_name: my-trial-3
seed: 1
mode: local
wandb:
  mode: disabled
  entity: null
  project: null
  name: null
  job_type: null
  group: null
  notes: null
  tags: null
  config: null
tensorboard:
  path: null
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  save_freq_steps: null
  save_freq_secs: null
  ckpt_freq_epochs: null
  ckpt_freq_steps: null
  ckpt_freq_secs: 600
  eval_freq_epochs: null
  eval_freq_steps: null
  eval_freq_secs: null
  benchmark_steps: null
  benchmark_n_seqs: null
# Shut down generation servers when the experiment exits
# (set to false to keep persistent servers across runs)
shutdown_server_on_exit: true
# Allocation and parallelism
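# allocation_mode encodes GPU parallelism as d<data>p<pipeline>t<tensor>.
# "sglang.d4p1t1+d4p1t1" presumably splits the 8 GPUs into 4 for SGLang
# rollout servers and 4 for training, each with no pipeline/tensor parallelism.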
allocation_mode: sglang.d4p1t1+d4p1t1
n_nodes: 1
n_gpus_per_node: 8
# Cluster configuration
ray_temp_path: /tmp/ray
cluster:
  cluster_name: local
  fileroot: /tmp/arealite/
  n_nodes: 1
  n_gpus_per_node: 8
  name_resolve:
    type: nfs
    nfs_record_root: /tmp/arealite/name_resolve/
# Datasets
train_dataset:
  path: json
  name: null
  split: train
  data_files: /storage/openpsi/users/xushusheng.xss/training_data/boba_106k_0319.jsonl
  batch_size: 32
  shuffle: true
  preprocessor:
    type: areal
valid_dataset: null
# Rollout config
rollout:
  collector:
    type: rlvr
    rlvr:
      reward_type: areal-math
      solution_path: /storage/openpsi/users/xushusheng.xss/training_data/boba_106k_0319.jsonl
  num_workers: 1
  max_concurrent_rollouts: null
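  # Staleness control: 0 presumably restricts training to rollouts generated by
  # the latest weights; larger values let generation run ahead by that many versions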
  max_head_offpolicyness: 0
  filter_reward_lb: -10000
  filter_reward_ub: 10000
  server_backend: sglang
  model_path: /storage/openpsi/models/Qwen__Qwen3-1.7B/
  gconfig:
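    # 16 samples per prompt form one GRPO group for the group-relative baseline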
    n_samples: 16
    max_new_tokens: 512
    min_new_tokens: 0
    top_p: 1.0
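    # A very large top_k effectively disables top-k filtering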
    top_k: 1000000
    temperature: 1.0
  llm_client:
    schedule_policy: round_robin
    request_timeout: 3600
    request_retries: 3
  llm_service:
    served_model_name: null
    health_check_interval: 5
    startup_timeout: 300
    max_unhealth_count: 3
    graceful_shutdown_on_unhealthy: true
  sglang:
    dtype: "bfloat16"
    enable_mixed_chunk: false
    enable_torch_compile: false
    torch_compile_max_bs: 32
    cuda_graph_max_bs: null
    cuda_graph_bs: null
    triton_attention_reduce_in_fp32: false
    triton_attention_num_kv_splits: 8
    num_continuous_decode_steps: 1
    attention_backend: "flashinfer"
    sampling_backend: null
    context_length: 32768
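    # Fraction of GPU memory SGLang reserves statically for weights and KV cache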
    mem_fraction_static: 0.9
    max_running_requests: null
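    # -1 disables chunked prefill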
    chunked_prefill_size: -1
    max_prefill_tokens: 32768
    schedule_policy: "lpm"
    schedule_conservativeness: 1.0
    cpu_offload_gb: 0
    kv_cache_dtype: "auto"
    log_level: "warning"
    log_level_http: "warning"
    log_requests: false
    log_requests_level: 0
    show_time_cost: false
    enable_metrics: true
    decode_log_interval: 1
# Trainer
trainer:
  type: grpo
  grpo:
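    # Asynchronous GRPO: training consumes rollouts while the SGLang servers
    # keep generating, rather than alternating generate/train phases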
    async_training: true
    actor:
      path: /storage/openpsi/models/Qwen__Qwen3-1.7B/
      init_from_scratch: false
      gradient_checkpointing: false
      bf16: true
      optimizer:
        type: adam
        lr: 1.0e-6
        weight_decay: 0.05
        beta1: 0.9
        beta2: 0.999
        eps: 1.0e-08
        min_lr_ratio: 0.0
        lr_scheduler_type: constant
        warmup_steps_proportion: 0.001
        initial_loss_scale: 4294967296.0
        min_loss_scale: 1.0
        loss_scale_window: 5.0
        hysteresis: 2
        gradient_clipping: 1.0
      backend:
        type: fsdp
    ref: null
    mb_spec:
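      # Each forward/backward microbatch packs at most this many tokens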
      max_tokens_per_mb: 10240
    # Algorithm
    group_adv_norm: false
    ppo_n_minibatches: 4
    eps_clip: 0.2
    c_clip: null
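    # Linear reward shaping (presumably a scale/shift of the raw 0/1 correctness
    # reward, clipped to max_reward_clip)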
    reward_scaling: 10.0
    reward_bias: -0.5
    max_reward_clip: 20.0
    mask_no_eos_with_zero: false
    discount: 1.0
    gae_lambda: 1.0
    adv_norm: true
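    # KL penalty coefficient; 0.0 disables it, consistent with ref: null above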
    kl_ctl: 0.0
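    # Recompute log-probs with the trainer before the update (inference-engine
    # log-probs can differ numerically) and use AReaL's decoupled PPO loss, which
    # reweights against the behavior policy for asynchronous training; the
    # behavior importance weight is uncapped when behav_imp_weight_cap is null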
    recompute_logprob: true
    use_decoupled_loss: true
    behav_imp_weight_cap: null