# Source: AReaL/examples/arealite/configs/gsm8k_sft.yaml
# (79 lines, 1.4 KiB, YAML)

# Run identity. Both values are reused elsewhere in this file through the
# ${experiment_name} and ${trial_name} interpolations (model, saver,
# checkpointer, evaluator and stats_logger settings).
experiment_name: gsm8k-sft
trial_name: trial0
# Cluster layout for this run: one node with eight GPUs.
# NOTE(review): name_resolve is assumed to be a cluster-level setting --
# confirm against the config schema.
# NOTE(review): ${cluster.fileroot} is referenced by the saver, checkpointer,
# evaluator and stats_logger settings, but no fileroot key is set here;
# presumably the schema supplies a default -- confirm, or set it explicitly.
cluster:
  n_nodes: 1
  n_gpus_per_node: 8
  name_resolve:
    type: nfs
    nfs_record_root: /tmp/areal/name_resolve
# Top-level run settings.
seed: 1
total_train_epochs: 1
# Interpolation referring to the `path` key of the `model` section, so the
# tokenizer is taken from the same location as the model.
tokenizer_path: ${model.path}
# Model / training-engine settings. `path` is also the target of the
# top-level ${model.path} interpolation used for tokenizer_path.
model:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  path: Qwen/Qwen3-1.7B
  init_from_scratch: false
  gradient_checkpointing: false
  dtype: bfloat16
  # presumably caps the number of tokens per micro-batch -- TODO confirm
  mb_spec:
    max_tokens_per_mb: 4096
  optimizer:
    type: adam
    # Written with an explicit decimal point: YAML 1.1 loaders (e.g. plain
    # PyYAML) read `2e-5` as a string rather than a float.
    lr: 2.0e-5
    weight_decay: 0.05
    beta1: 0.9
    beta2: 0.95
    eps: 1.0e-5
    lr_scheduler_type: cosine
    gradient_clipping: 1.0
  # NOTE(review): backend is assumed to be a model-level key rather than an
  # optimizer option -- confirm against the config schema.
  backend: fsdp
# Training data loader settings. Uses the same dataset path and type as
# valid_dataset.
train_dataset:
  batch_size: 128
  shuffle: true
  pin_memory: true
  num_workers: 4
  path: openai/gsm8k
  type: sft
# Validation data loader settings. Values mirror train_dataset.
# NOTE(review): shuffle is enabled for validation as well -- confirm this is
# intentional.
valid_dataset:
  batch_size: 128
  shuffle: true
  pin_memory: true
  num_workers: 4
  path: openai/gsm8k
  type: sft
# Utilities
# Saver schedule: only the epoch-based frequency is set (every 1 epoch);
# the step- and time-based frequencies are explicitly null.
saver:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: null
  freq_secs: null
# Checkpointer schedule: both an epoch-based (1) and a time-based (3600)
# frequency are set; the step-based frequency is explicitly null.
# presumably freq_secs is in seconds, i.e. hourly -- TODO confirm
checkpointer:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: null
  freq_secs: 3600
# Evaluator schedule: only the step-based frequency is set (1); the epoch-
# and time-based frequencies are explicitly null.
# presumably this means evaluation runs after every training step -- confirm
evaluator:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: null
  freq_steps: 1
  freq_secs: null
# Stats logger settings: run identity plus the shared ${cluster.fileroot}
# output root, the same three keys used by saver, checkpointer and evaluator.
stats_logger:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}