Merge updates from ant repository. (#18)

* fix: `self.tasks_ids` should also be filtered * PullRequest: 67 Update v0.2.0 Dockerfile Merge branch fw/v0.2.0-dockerfile of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/67 Signed-off-by: 温差 <xushusheng.xss@antgroup.com> * fw/v0.2.0-dockerfile * PullRequest: 66 Update v0.2.0 cover letter Merge branch fw/v0.2.0-readme of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/66 Signed-off-by: 温差 <xushusheng.xss@antgroup.com> * . * . * . * . * . * update thpt fig * update readme 20250329-20:16 * update * update tutorial * . * upload 7B zero and 32B sft config * PullRequest: 72 change the condition of using etcd Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/72 Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com> * change the condition of using etcd * PullRequest: 60 Change the default SGLang parameters to avoid precision issues. Merge branch fw/fix-sglang of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/60 Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com> * change vllm config * . * . * PullRequest: 73 Fix a setup issue when using ETCD Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/73 Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com> * fix etcd * . * . * PullRequest: 75 Fix epoch counter before model function call execution. Merge branch fw/fix-epoch-counter of git@code.alipay.com:inclusionAI/AReaL.git into main https://code.alipay.com/inclusionAI/AReaL/pull_requests/75 Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com> * . --------- Signed-off-by: 博惟 <bowei.fw@antgroup.com> Co-authored-by: wanghuaijie.whj <wanghuaijie.whj@antgroup.com> Co-authored-by: 博惟 <bowei.fw@antgroup.com> Co-authored-by: meijun <meijun.mei@antgroup.com>
2025-03-31 21:05:57 +08:00 · 2025-03-31 21:05:57 +08:00 · 1c33379c93
parent f4bd798ed9
commit 1c33379c93
8 changed files with 24 additions and 11 deletions
--- a/realhf/api/cli_args.py
+++ b/realhf/api/cli_args.py
@ -297,11 +297,13 @@ class SGLangConfig:
    # NOTE: to avoid the illegal memory access error
    attention_backend: Optional[str] = "triton"
    sampling_backend: Optional[str] = None
-    context_length: Optional[int] = None
-    mem_fraction_static: Optional[float] = None
+    context_length: Optional[int] = 32768
+    mem_fraction_static: Optional[float] = 0.9
    max_running_requests: Optional[int] = None
-    max_total_tokens: Optional[int] = None
-    chunked_prefill_size: Optional[int] = None
+    # NOTE: SGLang defaults chunked_prefill_size to 8192 on GPUs with 80GB of memory,
+    # but we set it to -1 (disabled) to avoid precision issues.
+    chunked_prefill_size: Optional[int] = -1
+    max_prefill_tokens: int = 32768
    max_prefill_tokens: int = 16384
    schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0
--- a/realhf/apps/main.py
+++ b/realhf/apps/main.py
@ -161,7 +161,7 @@ def main_start(args, recover_count: int = 0):
        REAL_RECOVER_RUN="1" if is_recover_run else "0",
        REAL_SAVE_RECOVER_STATES="1" if save_recover_states else "0",
        FUNCTIONCALL_SERVICE_DOMAIN=os.getenv("FUNCTIONCALL_SERVICE_DOMAIN", ""),
-        REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
+        REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
    )
    for k, v in BASE_ENVIRONS.items():
        os.environ[k] = v
--- a/realhf/base/name_resolve.py
+++ b/realhf/base/name_resolve.py
@ -543,7 +543,10 @@ class Etcd3NameRecordRepository(NameRecordRepository):
    """

    # Default configuration
-    host, port = os.getenv("REAL_ETCD_ADDR", "localhost:2379").split(":")
+    try:
+        host, port = os.getenv("REAL_ETCD_ADDR", "").split(":")
+    except ValueError:
+        host, port = "localhost", 2379
    ETCD_HOST = host
    ETCD_PORT = int(port)
    ETCD_USER = None
@ -897,7 +900,11 @@ def make_repository(type_="nfs", **kwargs):

 # DEFAULT_REPOSITORY_TYPE = "redis" if socket.gethostname().startswith("frl") else "nfs"
 DEFAULT_REPOSITORY_TYPE = "nfs"
-if etcd3 is not None and cluster.spec.name in ["wa180"]:
+if (
+    etcd3 is not None
+    and cluster.spec.name in ["wa180", "na132", "su18"]
+    and os.getenv("REAL_ETCD_ADDR", "")
+):
    DEFAULT_REPOSITORY_TYPE = "etcd3"
 DEFAULT_REPOSITORY = make_repository(DEFAULT_REPOSITORY_TYPE)
 add = DEFAULT_REPOSITORY.add
--- a/realhf/impl/dataset/math_code_dataset.py
+++ b/realhf/impl/dataset/math_code_dataset.py
@ -128,6 +128,7 @@ class MATHCodePromptDataset(torch.utils.data.Dataset):
        self.ids = [
            str(self.ids[idx]) + f"@idx:{idx}-{util.dp_rank}" for idx in indices
        ]
+        self.tasks_ids = [self.tasks_ids[idx] for idx in indices]
        if "scores" in data[0]:
            self.base_scores = [self.base_scores[idx] for idx in indices]

--- a/realhf/impl/model/backend/sglang.py
+++ b/realhf/impl/model/backend/sglang.py
@ -415,7 +415,9 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
        ports = [None for _ in range(constants.data_parallel_world_size())]
        while any(port is None for port in ports) or len(set(ports)) != len(ports):
            dist.all_gather_object(
-                ports, network.find_free_port(), group=constants.data_parallel_group()
+                ports,
+                network.find_free_port(low=20000, high=40000),
+                group=constants.data_parallel_group(),
            )
        additional_args["port"] = ports[constants.data_parallel_rank()]

--- a/realhf/system/controller.py
+++ b/realhf/system/controller.py
@ -622,7 +622,7 @@ class RayController:
            REAL_DUMP_TRACE=os.environ.get("REAL_DUMP_TRACE", "0"),
            REAL_RECORD_PERFORMANCE=os.environ.get("REAL_RECORD_PERFORMANCE", "0"),
            REAL_DUMP_MEMORY=os.environ.get("REAL_DUMP_MEMORY", "0"),
-            REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
+            REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
        )
        runtime_env = {
            "env_vars": env_vars,
--- a/realhf/system/master_worker.py
+++ b/realhf/system/master_worker.py
@ -368,6 +368,9 @@ class MasterWorker(worker_base.Worker):
        epoch = self.__rpc_ctrl.step_info.epoch + 1
        epoch_step = self.__rpc_ctrl.step_info.epoch_step + 1
        global_step = self.__rpc_ctrl.step_info.global_step + 1
+        if is_new_epoch:
+            epoch += 1
+            epoch_step = 1
        s = f"The next step is epoch {epoch}/{self.config.exp_ctrl.total_train_epochs} "
        s += f"step {epoch_step}/{self._steps_per_epoch} "
        s += f"(global step {global_step}). "
--- a/realhf/system/worker_base.py
+++ b/realhf/system/worker_base.py
@ -20,7 +20,6 @@ logger = logging.getLogger("worker")

 _MAX_SOCKET_CONCURRENCY = 1000
 WORKER_WAIT_FOR_CONTROLLER_SECONDS = 3600
-WORKER_JOB_STATUS_LINGER_SECONDS = 1800


 class WorkerException(Exception):
@ -185,7 +184,6 @@ class WorkerServer:
                worker_name=self.__worker_name,
            ),
            value=status.value,
-            keepalive_ttl=WORKER_JOB_STATUS_LINGER_SECONDS,  # Job Status lives one minutes after worker exit.
            replace=True,
            delete_on_exit=False,
        )