This commit is contained in:
bowei.fw 2025-07-14 10:09:24 +08:00
parent ae20d51cce
commit 29172e0e10
5 changed files with 16 additions and 7 deletions

View File

@ -49,7 +49,7 @@ from arealite.utils.fsdp import (
from arealite.utils.model import disable_dropout_in_model
from arealite.utils.save_load import get_state_dict_from_repo_id_or_path
from realhf.api.core.data_api import load_hf_tokenizer
from realhf.base import logging, name_resolve, names, pkg_version
from realhf.base import logging, name_resolve, names, pkg_version, constants
logger = logging.getLogger("FSDPEngine")
@ -91,7 +91,7 @@ class FSDPEngine(TrainEngine):
"""Initialize distributed communication and model."""
if not dist.is_initialized():
# TODO: Handle the condition when WORLD_SIZE and RANK is not set in launcher
dist.init_process_group(backend="nccl")
dist.init_process_group(backend="nccl", timeout=constants.NCCL_DEFAULT_TIMEOUT)
# TODO: Handle the condition when LOCAL_RANK is not set in launcher
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

View File

@ -353,6 +353,10 @@ class RemoteSGLangEngine(InferenceEngine):
stop_reason = finish_reason["type"]
payload["input_ids"] += result[SGLANG_TOKEN_OUTPUT_IDENTIFIER]
sample_params["max_new_tokens"] = min(
sample_params["max_new_tokens"],
gconfig.max_new_tokens - len(output_tokens),
)
latency = time.perf_counter() - start_time

View File

@ -357,7 +357,7 @@ class SlurmLauncher:
# Prepare the command for each job in the array
job_cmd = cmd[i]
# FIXME: only for debugging, remove and replace new image
job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
# job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
srun_cmd = SRUN_CMD_TEMPLATE.format(
nodes=1,

View File

@ -10,8 +10,8 @@ cluster:
name_resolve:
type: etcd3
etcd3_addr: etcd-client.openpsi-etcd.svc.sigma-na130-lingbo.na130.wl-robby.local:2379
gpu_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
gpu_infer_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
gpu_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
gpu_infer_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
seed: 1
total_train_epochs: 10
tokenizer_path: ${actor.path}
@ -92,7 +92,7 @@ sglang:
# datasets
train_dataset:
batch_size: 16
batch_size: 128
shuffle: true
pin_memory: true
@ -117,7 +117,7 @@ evaluator:
experiment_name: ${experiment_name}
trial_name: ${trial_name}
fileroot: ${cluster.fileroot}
freq_epochs: 1
freq_epochs: null
freq_steps: null
freq_secs: null

5
run.sh Normal file
View File

@ -0,0 +1,5 @@
#!/bin/bash
# Launch the arealite "boba" example via the arealite.launcher.slurm module.
#
# The two WANDB_* variables are placed only in the environment of the python3
# process started below; they are not set in this shell itself.
#
# NOTE(review): the API key and server address are committed here in plain
# text -- confirm this is intentional for this repository.
#
# Arguments handed to the launcher, in order:
#   1. the example entry script,
#   2. --config followed by the YAML config path,
#   3. a key=value item setting trial_name for this run.
env \
    WANDB_API_KEY=local-5dd08fc1894114d0bea728566d5c35c5b31ee608 \
    WANDB_BASE_URL=http://8.150.1.98:8080 \
    python3 -m arealite.launcher.slurm \
        examples/arealite/boba.py \
        --config examples/arealite/configs/boba.yaml \
        trial_name=run0713-6