This commit is contained in:
jlamanna 2025-07-19 02:11:14 +08:00 committed by GitHub
commit f53f0485e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 9 additions and 7 deletions

View File

@ -382,15 +382,13 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port
bool gidCandidateMatchSubnet = matchGidAddrPrefix(usrFam, prefix, prefixlen, &gidCandidate);
if (gidCandidateFam != gidFam && gidCandidateFam == usrFam && gidCandidateMatchSubnet) {
*gidIndex = gidIndexCandidate;
} else {
if (gidCandidateFam != usrFam || !validGid(&gidCandidate) || !gidCandidateMatchSubnet) {
return ncclSuccess;
}
int usrRoceVer = roceVer;
int gidRoceVerNum, gidRoceVerNumCandidate = -1;
const char* deviceName = wrap_ibv_get_device_name(context->device);
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
if (validGid(&gid)) {
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
}
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate));
if ((gidRoceVerNum != gidRoceVerNumCandidate || !validGid(&gid)) && gidRoceVerNumCandidate == usrRoceVer) {
*gidIndex = gidIndexCandidate;
@ -444,9 +442,13 @@ static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portN
*gidIndex = 0;
for (int gidIndexNext = 1; gidIndexNext < gidTblLen; ++gidIndexNext) {
NCCLCHECK(ncclUpdateGidIndex(context, portNum, userAddrFamily, prefix, prefixlen, userRoceVersion, gidIndexNext, gidIndex));
// It is OK for this call to return non-success: GID assignment is fully handled inside ncclUpdateGidIndex.
// The result is deliberately ignored so that this loop is not short-circuited prematurely when the GID table does not start at index 1.
ncclUpdateGidIndex(context, portNum, userAddrFamily, prefix, prefixlen, userRoceVersion, gidIndexNext, gidIndex);
}
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using GID Index %d", *gidIndex);
return ncclSuccess;
}