AReaL/arealite/config/async_grpo.yaml

# Basic experiment info
experiment_name: gsm8k-test
trial_name: my-trial-3
seed: 1
mode: local
wandb:
  mode: disabled
  entity: null
  project: null
  name: null
  job_type: null
  group: null
  notes: null
  tags: null
  config: null
tensorboard:
  path: null
exp_ctrl:
  total_train_epochs: 5
  save_freq_epochs: 1
  save_freq_steps: null
  save_freq_secs: null
  ckpt_freq_epochs: null
  ckpt_freq_steps: null
  ckpt_freq_secs: 600
  eval_freq_epochs: null
  eval_freq_steps: null
  eval_freq_secs: null
  benchmark_steps: null
  benchmark_n_seqs: null
# Shut down generation servers when the experiment exits
# (set to false to keep persistent servers across runs)
shutdown_server_on_exit: true
# Allocation and parallelism
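# allocation_mode encodes GPU parallelism as d<data>p<pipeline>t<tensor>.
# "sglang.d4p1t1+d4p1t1" presumably splits the 8 GPUs into 4 for SGLang
# rollout servers and 4 for training, each with no pipeline/tensor parallelism.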
allocation_mode: sglang.d4p1t1+d4p1t1
n_nodes: 1
n_gpus_per_node: 8
# Cluster configuration
ray_temp_path: /tmp/ray
cluster:
  cluster_name: local
  fileroot: /tmp/arealite/
  n_nodes: 1
  n_gpus_per_node: 8
  name_resolve:
    type: nfs
    nfs_record_root: /tmp/arealite/name_resolve/
# Datasets
train_dataset:
  path: json
  name: null
  split: train
  data_files: /storage/openpsi/users/xushusheng.xss/training_data/boba_106k_0319.jsonl
  batch_size: 32
  shuffle: true
  preprocessor:
    type: areal
valid_dataset: null
# Rollout config
rollout:
  collector:
    type: rlvr
    rlvr:
      reward_type: areal-math
      solution_path: /storage/openpsi/users/xushusheng.xss/training_data/boba_106k_0319.jsonl
  num_workers: 1
  max_concurrent_rollouts: null
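  # Staleness control: 0 presumably restricts training to rollouts generated by
  # the latest weights; larger values let generation run ahead by that many versions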
  max_head_offpolicyness: 0
  filter_reward_lb: -10000
  filter_reward_ub: 10000
  server_backend: sglang
  model_path: /storage/openpsi/models/Qwen__Qwen3-1.7B/
  gconfig:
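    # 16 samples per prompt form one GRPO group for the group-relative baseline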
    n_samples: 16
    max_new_tokens: 512
    min_new_tokens: 0
    top_p: 1.0
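    # A very large top_k effectively disables top-k filtering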
    top_k: 1000000
    temperature: 1.0
  llm_client:
    schedule_policy: round_robin
    request_timeout: 3600
    request_retries: 3
  llm_service:
    served_model_name: null
    health_check_interval: 5
    startup_timeout: 300
    max_unhealth_count: 3
    graceful_shutdown_on_unhealthy: true
  sglang:
    dtype: "bfloat16"
    enable_mixed_chunk: false
    enable_torch_compile: false
    torch_compile_max_bs: 32
    cuda_graph_max_bs: null
    cuda_graph_bs: null
    triton_attention_reduce_in_fp32: false
    triton_attention_num_kv_splits: 8
    num_continuous_decode_steps: 1
    attention_backend: "flashinfer"
    sampling_backend: null
    context_length: 32768
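    # Fraction of GPU memory SGLang reserves statically for weights and KV cache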
    mem_fraction_static: 0.9
    max_running_requests: null
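    # -1 disables chunked prefill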
    chunked_prefill_size: -1
    max_prefill_tokens: 32768
    schedule_policy: "lpm"
    schedule_conservativeness: 1.0
    cpu_offload_gb: 0
    kv_cache_dtype: "auto"
    log_level: "warning"
    log_level_http: "warning"
    log_requests: false
    log_requests_level: 0
    show_time_cost: false
    enable_metrics: true
    decode_log_interval: 1
# Trainer
trainer:
  type: grpo
  grpo:
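    # Asynchronous GRPO: training consumes rollouts while the SGLang servers
    # keep generating, rather than alternating generate/train phases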
    async_training: true
    actor:
      path: /storage/openpsi/models/Qwen__Qwen3-1.7B/
      init_from_scratch: false
      gradient_checkpointing: false
      bf16: true
      optimizer:
        type: adam
        lr: 1.0e-6
        weight_decay: 0.05
        beta1: 0.9
        beta2: 0.999
        eps: 1.0e-08
        min_lr_ratio: 0.0
        lr_scheduler_type: constant
        warmup_steps_proportion: 0.001
        initial_loss_scale: 4294967296.0
        min_loss_scale: 1.0
        loss_scale_window: 5.0
        hysteresis: 2
        gradient_clipping: 1.0
      backend:
        type: fsdp
    ref: null
    mb_spec:
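      # Each forward/backward microbatch packs at most this many tokens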
      max_tokens_per_mb: 10240
    # Algorithm
    group_adv_norm: false
    ppo_n_minibatches: 4
    eps_clip: 0.2
    c_clip: null
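    # Linear reward shaping (presumably a scale/shift of the raw 0/1 correctness
    # reward, clipped to max_reward_clip)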
    reward_scaling: 10.0
    reward_bias: -0.5
    max_reward_clip: 20.0
    mask_no_eos_with_zero: false
    discount: 1.0
    gae_lambda: 1.0
    adv_norm: true
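    # KL penalty coefficient; 0.0 disables it, consistent with ref: null above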
    kl_ctl: 0.0
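    # Recompute log-probs with the trainer before the update (inference-engine
    # log-probs can differ numerically) and use AReaL's decoupled PPO loss, which
    # reweights against the behavior policy for asynchronous training; the
    # behavior importance weight is uncapped when behav_imp_weight_cap is null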
    recompute_logprob: true
    use_decoupled_loss: true
    behav_imp_weight_cap: null