.

2025-07-14 10:09:24 +08:00 · 2025-07-14 10:09:24 +08:00 · 29172e0e10
parent ae20d51cce
commit 29172e0e10
5 changed files with 16 additions and 7 deletions
--- a/arealite/engine/fsdp_engine.py
+++ b/arealite/engine/fsdp_engine.py
@@ -49,7 +49,7 @@ from arealite.utils.fsdp import (
 from arealite.utils.model import disable_dropout_in_model
 from arealite.utils.save_load import get_state_dict_from_repo_id_or_path
 from realhf.api.core.data_api import load_hf_tokenizer
-from realhf.base import logging, name_resolve, names, pkg_version
+from realhf.base import logging, name_resolve, names, pkg_version, constants

 logger = logging.getLogger("FSDPEngine")

@@ -91,7 +91,7 @@ class FSDPEngine(TrainEngine):
        """Initialize distributed communication and model."""
        if not dist.is_initialized():
            # TODO: Handle the condition when WORLD_SIZE and RANK is not set in launcher
-            dist.init_process_group(backend="nccl")
+            dist.init_process_group(backend="nccl", timeout=constants.NCCL_DEFAULT_TIMEOUT)

        # TODO: Handle the condition when LOCAL_RANK is not set in launcher
        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
--- a/arealite/engine/sglang_remote.py
+++ b/arealite/engine/sglang_remote.py
@@ -353,6 +353,10 @@ class RemoteSGLangEngine(InferenceEngine):
            stop_reason = finish_reason["type"]

            payload["input_ids"] += result[SGLANG_TOKEN_OUTPUT_IDENTIFIER]
+            sample_params["max_new_tokens"] = min(
+                sample_params["max_new_tokens"],
+                gconfig.max_new_tokens - len(output_tokens),
+            )

        latency = time.perf_counter() - start_time

--- a/arealite/launcher/slurm.py
+++ b/arealite/launcher/slurm.py
@@ -357,7 +357,7 @@ class SlurmLauncher:
            # Prepare the command for each job in the array
            job_cmd = cmd[i]
            # FIXME: only for debugging, remove and replace new image
-            job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'
+            # job_cmd = f'bash -c "pip3 install -r requirements.txt; {job_cmd}"'

            srun_cmd = SRUN_CMD_TEMPLATE.format(
                nodes=1,
--- a/examples/arealite/configs/boba.yaml
+++ b/examples/arealite/configs/boba.yaml
@@ -10,8 +10,8 @@ cluster:
  name_resolve:
    type: etcd3
    etcd3_addr: etcd-client.openpsi-etcd.svc.sigma-na130-lingbo.na130.wl-robby.local:2379
-  gpu_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
-  gpu_infer_image: /storage/openpsi/images/areal-v0.3.0.post1.sif
+  gpu_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
+  gpu_infer_image: /storage/openpsi/images/arealite-20250712-update-hf-xet.sif
 seed: 1
 total_train_epochs: 10
 tokenizer_path: ${actor.path}
@@ -92,7 +92,7 @@ sglang:

 # datasets
 train_dataset:
-  batch_size: 16
+  batch_size: 128
  shuffle: true
  pin_memory: true

@@ -117,7 +117,7 @@ evaluator:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
-  freq_epochs: 1
+  freq_epochs: null
  freq_steps: null
  freq_secs: null

--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+WANDB_API_KEY=local-5dd08fc1894114d0bea728566d5c35c5b31ee608 \
+WANDB_BASE_URL=http://8.150.1.98:8080 \
+    python3 -m arealite.launcher.slurm examples/arealite/boba.py --config examples/arealite/configs/boba.yaml \
+    trial_name=run0713-6