mirror of https://github.com/inclusionAI/AReaL
Merge updates from ant repository. (#18)
* fix: `self.tasks_ids` should also be filtered
* PullRequest: 67 Update v0.2.0 Dockerfile
  Merge branch fw/v0.2.0-dockerfile of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/67
  Signed-off-by: 温差 <xushusheng.xss@antgroup.com>
* fw/v0.2.0-dockerfile
* PullRequest: 66 Update v0.2.0 cover letter
  Merge branch fw/v0.2.0-readme of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/66
  Signed-off-by: 温差 <xushusheng.xss@antgroup.com>
* .
* .
* .
* .
* .
* update thpt fig
* update readme 20250329-20:16
* update
* update tutorial
* .
* upload 7B zero and 32B sft config
* PullRequest: 72 change the condition of using etcd
  Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/72
  Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>
* change the condition of using etcd
* PullRequest: 60 Change the default SGLang parameters to avoid precision issues.
  Merge branch fw/fix-sglang of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/60
  Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>
* change vllm config
* .
* .
* PullRequest: 73 Fix a setup issue when using ETCD
  Merge branch fw/fix-etcd of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/73
  Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>
* fix etcd
* .
* .
* PullRequest: 75 Fix epoch counter before model function call execution.
  Merge branch fw/fix-epoch-counter of git@code.alipay.com:inclusionAI/AReaL.git into main
  https://code.alipay.com/inclusionAI/AReaL/pull_requests/75
  Signed-off-by: 晓雷 <meizhiyu.mzy@antgroup.com>
* .
---------
Signed-off-by: 博惟 <bowei.fw@antgroup.com>
Co-authored-by: wanghuaijie.whj <wanghuaijie.whj@antgroup.com>
Co-authored-by: 博惟 <bowei.fw@antgroup.com>
Co-authored-by: meijun <meijun.mei@antgroup.com>
parent f4bd798ed9
commit 1c33379c93
@@ -297,11 +297,13 @@ class SGLangConfig:
     # NOTE: to avoid the illegal memory access error
     attention_backend: Optional[str] = "triton"
     sampling_backend: Optional[str] = None
-    context_length: Optional[int] = None
-    mem_fraction_static: Optional[float] = None
+    context_length: Optional[int] = 32768
+    mem_fraction_static: Optional[float] = 0.9
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: Optional[int] = None
+    # NOTE: chunked_prefill_size is by default 8192 on GPUs with 80GB mem in SGLang,
+    # but we disable it to avoid precision issues
+    chunked_prefill_size: Optional[int] = -1
-    max_prefill_tokens: int = 32768
+    max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
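The hunk above replaces SGLang's implicit defaults with concrete ones: a 32k context length, a 0.9 static memory fraction, chunked prefill disabled via -1, and a halved prefill token budget. These dataclass fields are ultimately forwarded to the SGLang server as command-line flags; a minimal sketch of that mapping, assuming the usual snake_case-to-kebab-case flag spelling (the helper below is illustrative, not code from this repository):

```python
from dataclasses import dataclass, fields
from typing import List, Optional


@dataclass
class SGLangConfigSketch:
    # Mirrors the new defaults from the hunk above.
    attention_backend: Optional[str] = "triton"
    sampling_backend: Optional[str] = None
    context_length: Optional[int] = 32768
    mem_fraction_static: Optional[float] = 0.9
    max_running_requests: Optional[int] = None
    max_total_tokens: Optional[int] = None
    chunked_prefill_size: Optional[int] = -1  # -1 turns chunked prefill off
    max_prefill_tokens: int = 16384
    schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0


def to_server_flags(cfg: SGLangConfigSketch) -> List[str]:
    """Turn every non-None field into a `--kebab-case value` pair."""
    flags = []
    for f in fields(cfg):
        value = getattr(cfg, f.name)
        if value is None:
            continue
        flags += [f"--{f.name.replace('_', '-')}", str(value)]
    return flags


print(to_server_flags(SGLangConfigSketch()))
# ['--attention-backend', 'triton', '--context-length', '32768', ...]
```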
@@ -161,7 +161,7 @@ def main_start(args, recover_count: int = 0):
         REAL_RECOVER_RUN="1" if is_recover_run else "0",
         REAL_SAVE_RECOVER_STATES="1" if save_recover_states else "0",
         FUNCTIONCALL_SERVICE_DOMAIN=os.getenv("FUNCTIONCALL_SERVICE_DOMAIN", ""),
-        REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
+        REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
     )
     for k, v in BASE_ENVIRONS.items():
         os.environ[k] = v
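With the old default, every launched worker saw REAL_ETCD_ADDR="localhost:2379" even on machines with no etcd deployment; an empty default makes etcd strictly opt-in. A sketch of the check downstream code can now rely on (the helper name is hypothetical):

```python
import os


def etcd_configured() -> bool:
    """Hypothetical helper: etcd is only used when REAL_ETCD_ADDR is set explicitly."""
    return bool(os.getenv("REAL_ETCD_ADDR", ""))


if etcd_configured():
    print("using etcd-backed name resolution")
else:
    print("falling back to the NFS name record repository")
```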
@@ -543,7 +543,10 @@ class Etcd3NameRecordRepository(NameRecordRepository):
     """
 
     # Default configuration
-    host, port = os.getenv("REAL_ETCD_ADDR", "localhost:2379").split(":")
+    try:
+        host, port = os.getenv("REAL_ETCD_ADDR", "").split(":")
+    except ValueError:
+        host, port = "localhost", 2379
     ETCD_HOST = host
     ETCD_PORT = int(port)
     ETCD_USER = None
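The try/except tolerates an unset or empty REAL_ETCD_ADDR: splitting "" on ":" cannot be unpacked into two names, so the code falls back to localhost:2379, and the subsequent int(port) normalizes the string/int mismatch between the two branches. A minimal sketch of the same fallback in isolation:

```python
def parse_etcd_addr(env_value: str):
    """Sketch of the fallback above: "" (or any value without a colon) yields localhost:2379."""
    try:
        host, port = env_value.split(":")
    except ValueError:
        host, port = "localhost", 2379
    return host, int(port)


assert parse_etcd_addr("") == ("localhost", 2379)
assert parse_etcd_addr("10.0.0.5:2479") == ("10.0.0.5", 2479)
```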
@@ -897,7 +900,11 @@ def make_repository(type_="nfs", **kwargs):
 
 # DEFAULT_REPOSITORY_TYPE = "redis" if socket.gethostname().startswith("frl") else "nfs"
 DEFAULT_REPOSITORY_TYPE = "nfs"
-if etcd3 is not None and cluster.spec.name in ["wa180"]:
+if (
+    etcd3 is not None
+    and cluster.spec.name in ["wa180", "na132", "su18"]
+    and os.getenv("REAL_ETCD_ADDR", "")
+):
     DEFAULT_REPOSITORY_TYPE = "etcd3"
 DEFAULT_REPOSITORY = make_repository(DEFAULT_REPOSITORY_TYPE)
 add = DEFAULT_REPOSITORY.add
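This is the "condition of using etcd" from PullRequest 72: the etcd3 backend is chosen only when the etcd3 Python client is importable, the cluster is one of the listed ones, and REAL_ETCD_ADDR is non-empty. The selection rule restated as a standalone function (function name is illustrative, cluster names taken from the diff):

```python
import os


def pick_repository_type(etcd3_available: bool, cluster_name: str) -> str:
    """Restatement of the selection rule in the hunk above."""
    if (
        etcd3_available
        and cluster_name in ["wa180", "na132", "su18"]
        and os.getenv("REAL_ETCD_ADDR", "")
    ):
        return "etcd3"
    return "nfs"


# etcd stays opt-in: even on a listed cluster, an empty REAL_ETCD_ADDR keeps the NFS backend.
os.environ.pop("REAL_ETCD_ADDR", None)
assert pick_repository_type(True, "wa180") == "nfs"
os.environ["REAL_ETCD_ADDR"] = "10.0.0.5:2379"
assert pick_repository_type(True, "wa180") == "etcd3"
```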
@@ -128,6 +128,7 @@ class MATHCodePromptDataset(torch.utils.data.Dataset):
         self.ids = [
             str(self.ids[idx]) + f"@idx:{idx}-{util.dp_rank}" for idx in indices
         ]
+        self.tasks_ids = [self.tasks_ids[idx] for idx in indices]
         if "scores" in data[0]:
             self.base_scores = [self.base_scores[idx] for idx in indices]
 
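This is the `self.tasks_ids` fix from the commit message: when the dataset is filtered down to a subset of indices, every parallel per-sample list has to be sliced with the same index set, otherwise prompt ids and task labels drift out of alignment. A minimal illustration of the invariant:

```python
# Every per-sample list must be filtered with the same indices.
ids = ["a", "b", "c", "d"]
tasks_ids = [0, 1, 0, 1]
base_scores = [0.1, 0.9, 0.4, 0.7]

indices = [0, 2]  # e.g. the samples kept on this data-parallel rank

ids = [ids[i] for i in indices]
tasks_ids = [tasks_ids[i] for i in indices]  # the line this commit adds
base_scores = [base_scores[i] for i in indices]

assert len(ids) == len(tasks_ids) == len(base_scores) == len(indices)
```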
@@ -415,7 +415,9 @@ class SGLangGenerationBackend(ModelBackend, SGLangConfig):
         ports = [None for _ in range(constants.data_parallel_world_size())]
         while any(port is None for port in ports) or len(set(ports)) != len(ports):
             dist.all_gather_object(
-                ports, network.find_free_port(), group=constants.data_parallel_group()
+                ports,
+                network.find_free_port(low=20000, high=40000),
+                group=constants.data_parallel_group(),
             )
         additional_args["port"] = ports[constants.data_parallel_rank()]
 
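Each data-parallel rank proposes a free port, the proposals are all-gathered, and the loop retries until every rank holds a distinct port; the change restricts the search to the 20000-40000 range. A self-contained stand-in for `network.find_free_port` with such a range (an assumption about its behavior, not the repository's implementation):

```python
import random
import socket


def find_free_port(low: int = 20000, high: int = 40000) -> int:
    """Probe random ports in [low, high) until one can be bound."""
    while True:
        port = random.randrange(low, high)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("", port))
                return port  # the OS accepted the bind, so the port was free
            except OSError:
                continue  # already in use, try another candidate


print(find_free_port())
```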
@@ -622,7 +622,7 @@ class RayController:
             REAL_DUMP_TRACE=os.environ.get("REAL_DUMP_TRACE", "0"),
             REAL_RECORD_PERFORMANCE=os.environ.get("REAL_RECORD_PERFORMANCE", "0"),
             REAL_DUMP_MEMORY=os.environ.get("REAL_DUMP_MEMORY", "0"),
-            REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", "localhost:2379"),
+            REAL_ETCD_ADDR=os.getenv("REAL_ETCD_ADDR", ""),
         )
         runtime_env = {
             "env_vars": env_vars,
@@ -368,6 +368,9 @@ class MasterWorker(worker_base.Worker):
         epoch = self.__rpc_ctrl.step_info.epoch + 1
         epoch_step = self.__rpc_ctrl.step_info.epoch_step + 1
         global_step = self.__rpc_ctrl.step_info.global_step + 1
+        if is_new_epoch:
+            epoch += 1
+            epoch_step = 1
         s = f"The next step is epoch {epoch}/{self.config.exp_ctrl.total_train_epochs} "
         s += f"step {epoch_step}/{self._steps_per_epoch} "
         s += f"(global step {global_step}). "
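This is the epoch-counter fix from PullRequest 75: when the upcoming step crosses an epoch boundary, the logged epoch now advances and the in-epoch step resets to 1 instead of reporting a step past `self._steps_per_epoch`. A small worked example, with the counter values and the `is_new_epoch` flag assumed for illustration (the flag is computed elsewhere in the worker):

```python
# Illustration with 3 steps per epoch. Suppose the counters computed above come out as
# epoch 1, epoch_step 3, and the scheduler has flagged that the upcoming step starts a
# fresh pass over the data.
steps_per_epoch = 3
epoch, epoch_step = 1, 3
is_new_epoch = True

if is_new_epoch:   # the added branch rolls the counters over at the epoch boundary
    epoch += 1     # the next step belongs to epoch 2 ...
    epoch_step = 1 # ... and is step 1 of that epoch

print(f"The next step is epoch {epoch}, step {epoch_step}/{steps_per_epoch}.")
# -> The next step is epoch 2, step 1/3.
```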
@@ -20,7 +20,6 @@ logger = logging.getLogger("worker")
 
 _MAX_SOCKET_CONCURRENCY = 1000
 WORKER_WAIT_FOR_CONTROLLER_SECONDS = 3600
 WORKER_JOB_STATUS_LINGER_SECONDS = 1800
 
 
 class WorkerException(Exception):
@@ -185,7 +184,6 @@ class WorkerServer:
                 worker_name=self.__worker_name,
             ),
             value=status.value,
             keepalive_ttl=WORKER_JOB_STATUS_LINGER_SECONDS,  # Job Status lives one minutes after worker exit.
             replace=True,
             delete_on_exit=False,
         )
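The status record here is written with keepalive_ttl=WORKER_JOB_STATUS_LINGER_SECONDS and delete_on_exit=False, so it outlives the worker process and then expires on its own rather than disappearing the moment the worker exits. A sketch of the same pattern expressed directly against the python-etcd3 client (illustrative only; the repository goes through its own name-resolve layer, and the key name below is made up):

```python
import etcd3  # python-etcd3; illustrative, the repo wraps this behind its name-resolve API

WORKER_JOB_STATUS_LINGER_SECONDS = 1800

client = etcd3.client(host="localhost", port=2379)
lease = client.lease(WORKER_JOB_STATUS_LINGER_SECONDS)
# The key is not deleted on worker exit: it simply expires with the lease, so late
# readers can still observe the final job status for a while.
client.put("trial/worker-0/job_status", "COMPLETED", lease=lease)
```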