fix timeout for NCCL

This commit is contained in:
kira.gw 2025-03-28 20:11:19 +08:00
parent f86b78c1c4
commit b7e90fbd48
1 changed file with 3 additions and 4 deletions


@@ -59,10 +59,9 @@ class GlobalMemoryBuffer:
 return res
-# 30 minutes. Transferring super-large batches via NCCL bcast
-# for the first time may consumer over 600 secs, which is the
-# pytorch's default. Increase this value to 30 minutes.
-NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=1800)
+# For large models, generation may consume more than 3600s.
+# We set a large value to avoid NCCL timeout issues during generation.
+NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=7200)
 # We may want to use CPU for testing even when CUDA is available.
 TORCH_FORCE_CPU = False
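
The diff only changes the value of the timeout constant; it does not show where the constant is used. Below is a minimal sketch of how such a constant is typically forwarded to PyTorch's process-group setup. The init_distributed helper and its signature are assumptions for illustration, not part of this commit.

import datetime
import torch.distributed as dist

# Same constant as in the diff above: 2 hours instead of PyTorch's 600 s NCCL default.
NCCL_DEFAULT_TIMEOUT = datetime.timedelta(seconds=7200)

def init_distributed(rank: int, world_size: int) -> None:
    # Hypothetical call site: the timeout is passed to init_process_group so
    # that long first-time broadcasts or slow generation steps on large models
    # do not trip the default NCCL watchdog timeout.
    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size,
        timeout=NCCL_DEFAULT_TIMEOUT,
    )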