Merge updates from ant repository. (#18)

* fix: `self.tasks_ids` should also be filtered

* PullRequest: 67 Update v0.2.0 Dockerfile

Merge branch fw/v0.2.0-dockerfile of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/67

Signed-off-by: 温差 <xushusheng.xss@antgroup.com>


* fw/v0.2.0-dockerfile

* PullRequest: 66 Update v0.2.0 cover letter

Merge branch fw/v0.2.0-readme of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/66

Signed-off-by: 温差 <xushusheng.xss@antgroup.com>


* .
* .
* .
* .
* .
* update thpt fig
* update readme 20250329-20:16
* update
* update tutorial
* .

* upload 7B zero and 32B sft config

* PullRequest: 72 change the condition of using etcd

Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/72

Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>


* change the condition of using etcd

* PullRequest: 60 Change the default SGLang parameters to avoid precision issues.

Merge branch fw/fix-sglang of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/60

Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>


* change vllm config
* .
* .

* PullRequest: 73 Fix a setup issue when using ETCD

Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/73

Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>


* fix etcd
* .
* .

* PullRequest: 75 Fix epoch counter before model function call execution.

Merge branch fw/fix-epoch-counter of git@code.alipay.com:inclusionAI/AReaL.git into main
https://code.alipay.com/inclusionAI/AReaL/pull_requests/75

Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>


* .

---------

Signed-off-by: 博惟 <bowei.fw@antgroup.com>
Co-authored-by: wanghuaijie.whj <wanghuaijie.whj@antgroup.com>
Co-authored-by: 博惟 <bowei.fw@antgroup.com>
Co-authored-by: meijun <meijun.mei@antgroup.com>
This commit is contained in:
nuzant 2025-03-31 21:05:57 +08:00 committed by GitHub
parent f4bd798ed9
commit 1c33379c93
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 24 additions and 11 deletions

View File

@ -297,11 +297,13 @@ class SGLangConfig:
# NOTE: default to the "triton" attention backend to avoid an illegal memory access error
attention_backend: Optional[str] = "triton"
sampling_backend: Optional[str] = None
context_length: Optional[int] = None
mem_fraction_static: Optional[float] = None
context_length: Optional[int] = 32768
mem_fraction_static: Optional[float] = 0.9
max_running_requests: Optional[int] = None
max_total_tokens: Optional[int] = None
chunked_prefill_size: Optional[int] = None
# NOTE: SGLang defaults chunked_prefill_size to 8192 on GPUs with 80GB of memory,
# but we set it to -1 to disable chunked prefill and avoid precision issues
chunked_prefill_size: Optional[int] = -1
max_prefill_tokens: int = 32768
max_prefill_tokens: int = 16384
schedule_policy: str = "lpm"
schedule_conservativeness: float = 1.0

View File

@ -161,7 +161,7 @@ def main_start(args, recover_count: int = 0):
REAL_RECOVER_RUN="1" if is_recover_run else "0",
REAL_SAVE_RECOVER_STATES="1" if save_recover_states else "0",
FUNCTIONCALL_SERVICE_DOMAIN=os.getenv("FUNCTIONCALL_SERVICE_DOMAIN", ""),
REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
)
for k, v in BASE_ENVIRONS.items():
os.environ[k] = v

View File

@ -543,7 +543,10 @@ class Etcd3NameRecordRepository(NameRecordRepository):
"""
# Default configuration
host, port = os.getenv("REAL_ETCD_ADDR", "localhost:2379").split(":")
try:
host, port = os.getenv("REAL_ETCD_ADDR", "").split(":")
except ValueError:
host, port = "localhost", 2379
ETCD_HOST = host
ETCD_PORT = int(port)
ETCD_USER = None
@ -897,7 +900,11 @@ def make_repository(type_="nfs", **kwargs):
# DEFAULT_REPOSITORY_TYPE = "redis" if socket.gethostname().startswith("frl") else "nfs"
DEFAULT_REPOSITORY_TYPE = "nfs"
if etcd3 is not None and cluster.spec.name in ["wa180"]:
if (
etcd3 is not None
and cluster.spec.name in ["wa180", "na132", "su18"]
and os.getenv("REAL_ETCD_ADDR", "")
):
DEFAULT_REPOSITORY_TYPE = "etcd3"
DEFAULT_REPOSITORY = make_repository(DEFAULT_REPOSITORY_TYPE)
add = DEFAULT_REPOSITORY.add

View File

@ -128,6 +128,7 @@ class MATHCodePromptDataset(torch.utils.data.Dataset):
self.ids = [
str(self.ids[idx]) + f"@idx:{idx}-{util.dp_rank}" for idx in indices
]
self.tasks_ids = [self.tasks_ids[idx] for idx in indices]
if "scores" in data[0]:
self.base_scores = [self.base_scores[idx] for idx in indices]

View File

@ -415,7 +415,9 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
ports = [None for _ in range(constants.data_parallel_world_size())]
while any(port is None for port in ports) or len(set(ports)) != len(ports):
dist.all_gather_object(
ports, network.find_free_port(), group=constants.data_parallel_group()
ports,
network.find_free_port(low=20000, high=40000),
group=constants.data_parallel_group(),
)
additional_args["port"] = ports[constants.data_parallel_rank()]

View File

@ -622,7 +622,7 @@ class RayController:
REAL_DUMP_TRACE=os.environ.get("REAL_DUMP_TRACE", "0"),
REAL_RECORD_PERFORMANCE=os.environ.get("REAL_RECORD_PERFORMANCE", "0"),
REAL_DUMP_MEMORY=os.environ.get("REAL_DUMP_MEMORY", "0"),
REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
)
runtime_env = {
"env_vars": env_vars,

View File

@ -368,6 +368,9 @@ class MasterWorker(worker_base.Worker):
epoch = self.__rpc_ctrl.step_info.epoch + 1
epoch_step = self.__rpc_ctrl.step_info.epoch_step + 1
global_step = self.__rpc_ctrl.step_info.global_step + 1
if is_new_epoch:
epoch += 1
epoch_step = 1
s = f"The next step is epoch {epoch}/{self.config.exp_ctrl.total_train_epochs} "
s += f"step {epoch_step}/{self._steps_per_epoch} "
s += f"(global step {global_step}). "

View File

@ -20,7 +20,6 @@ logger = logging.getLogger("worker")
_MAX_SOCKET_CONCURRENCY = 1000
WORKER_WAIT_FOR_CONTROLLER_SECONDS = 3600
WORKER_JOB_STATUS_LINGER_SECONDS = 1800
class WorkerException(Exception):
@ -185,7 +184,6 @@ class WorkerServer:
worker_name=self.__worker_name,
),
value=status.value,
keepalive_ttl=WORKER_JOB_STATUS_LINGER_SECONDS, # Job Status lives one minutes after worker exit.
replace=True,
delete_on_exit=False,
)