mirror of https://github.com/inclusionAI/AReaL
slurm run
This commit is contained in:
parent
0d03141cbc
commit
43b3c3f8d0
|
@ -160,7 +160,7 @@ echo head_node_ip=$head_node_ip
|
|||
wait
|
||||
"""
|
||||
|
||||
SRUN_CMD_TEMPLATE = """srun --overlap --mpi=pmi2 -K -l --chdir $PWD --nodelist=${{nodes_array[{node_id}]}} \\
|
||||
SRUN_CMD_TEMPLATE: str = """srun --overlap --mpi=pmi2 -K -l --chdir $PWD --nodelist=${{nodes_array[{node_id}]}} \\
|
||||
--nodes={nodes} --ntasks={ntasks} --gres=gpu:{n_gpus_per_node} --cpus-per-task={cpus_per_task} \\
|
||||
--mem-per-cpu={mem_per_cpu}M {apptainer_name} exec {apptainer_options} --bind {container_mounts} \\
|
||||
{container_env_strings} \\
|
||||
|
@ -357,7 +357,7 @@ class SlurmLauncher:
|
|||
# Prepare the command for each job in the array
|
||||
job_cmd = cmd[i]
|
||||
# FIXME: only for debugging, remove and replace new image
|
||||
job_cmd = f'bash -c "pip3 install -U gymnasium torchdata tensordict hf-xet; {job_cmd}"'
|
||||
job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
|
||||
|
||||
srun_cmd = SRUN_CMD_TEMPLATE.format(
|
||||
nodes=1,
|
||||
|
|
|
@ -50,7 +50,6 @@ actor:
|
|||
warmup_steps_proportion: 0.001
|
||||
backend: fsdp
|
||||
|
||||
async_training: true
|
||||
group_size: ${gconfig.n_samples}
|
||||
group_adv_norm: false
|
||||
eps_clip: 0.4
|
||||
|
|
Loading…
Reference in New Issue