mirror of https://github.com/inclusionAI/AReaL
This commit is contained in:
parent
ae20d51cce
commit
29172e0e10
|
@ -49,7 +49,7 @@ from arealite.utils.fsdp import (
|
|||
from arealite.utils.model import disable_dropout_in_model
|
||||
from arealite.utils.save_load import get_state_dict_from_repo_id_or_path
|
||||
from realhf.api.core.data_api import load_hf_tokenizer
|
||||
from realhf.base import logging, name_resolve, names, pkg_version
|
||||
from realhf.base import logging, name_resolve, names, pkg_version, constants
|
||||
|
||||
logger = logging.getLogger("FSDPEngine")
|
||||
|
||||
|
@ -91,7 +91,7 @@ class FSDPEngine(TrainEngine):
|
|||
"""Initialize distributed communication and model."""
|
||||
if not dist.is_initialized():
|
||||
# TODO: Handle the condition when WORLD_SIZE and RANK are not set in launcher
|
||||
dist.init_process_group(backend="nccl")
|
||||
dist.init_process_group(backend="nccl", timeout=constants.NCCL_DEFAULT_TIMEOUT)
|
||||
|
||||
# TODO: Handle the condition when LOCAL_RANK is not set in launcher
|
||||
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
|
||||
|
|
|
@ -353,6 +353,10 @@ class RemoteSGLangEngine(InferenceEngine):
|
|||
stop_reason = finish_reason["type"]
|
||||
|
||||
payload["input_ids"] += result[SGLANG_TOKEN_OUTPUT_IDENTIFIER]
|
||||
sample_params["max_new_tokens"] = min(
|
||||
sample_params["max_new_tokens"],
|
||||
gconfig.max_new_tokens - len(output_tokens),
|
||||
)
|
||||
|
||||
latency = time.perf_counter() - start_time
|
||||
|
||||
|
|
|
@ -357,7 +357,7 @@ class SlurmLauncher:
|
|||
# Prepare the command for each job in the array
|
||||
job_cmd = cmd[i]
|
||||
# FIXME: only for debugging, remove and replace with new image
|
||||
job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
|
||||
# job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
|
||||
|
||||
srun_cmd = SRUN_CMD_TEMPLATE.format(
|
||||
nodes=1,
|
||||
|
|
|
@ -10,8 +10,8 @@ cluster:
|
|||
name_resolve:
|
||||
type: etcd3
|
||||
etcd3_addr: etcd-client.openpsi-etcd.svc.sigma-na130-lingbo.na130.wl-robby.local:2379
|
||||
gpu_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
|
||||
gpu_infer_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
|
||||
gpu_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
|
||||
gpu_infer_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
|
||||
seed: 1
|
||||
total_train_epochs: 10
|
||||
tokenizer_path: ${actor.path}
|
||||
|
@ -92,7 +92,7 @@ sglang:
|
|||
|
||||
# datasets
|
||||
train_dataset:
|
||||
batch_size: 16
|
||||
batch_size: 128
|
||||
shuffle: true
|
||||
pin_memory: true
|
||||
|
||||
|
@ -117,7 +117,7 @@ evaluator:
|
|||
experiment_name: ${experiment_name}
|
||||
trial_name: ${trial_name}
|
||||
fileroot: ${cluster.fileroot}
|
||||
freq_epochs: 1
|
||||
freq_epochs: null
|
||||
freq_steps: null
|
||||
freq_secs: null
|
||||
|
||||
|
|
Loading…
Reference in New Issue