Compare commits

...

3 Commits

Author SHA1 Message Date
jlamanna f53f0485e6
Merge b43170fd73 into 0d1ece2b43 2025-07-19 02:11:14 +08:00
Stephen Sachs 0d1ece2b43 Exclude ongoing issues from auto-closing logic
- Added a check to skip issues labeled "ongoing" in the close-old-issues script
- Adjusted the condition to compare both creation and update dates against six months ago
2025-07-17 21:50:05 +02:00
James Lamanna b43170fd73 Fix for sparse GID tables (not starting at one)
This fix adds support for sparse GID tables. In Kubernetes environments
in particular, a pod can end up with a GID table that does not start at
index "1". This arises when another pod on the same host has assigned
an IP address to the NIC (usually when the NIC is exposed through
MACVLAN to multiple pods).

An example:

```
$ show_gids
DEV     PORT    INDEX   GID                                     IPv4            VER     DEV
---     ----    -----   ---                                     ------------    ---     ---
mlx5_0  1       6       0000:0000:0000:0000:0000:ffff:0b00:0401 11.0.4.1        v1      net1
mlx5_0  1       7       0000:0000:0000:0000:0000:ffff:0b00:0401 11.0.4.1        v2      net1
mlx5_0  1       8       fe80:0000:0000:0000:e4ea:71ff:feb9:4970                 v1      net1
mlx5_0  1       9       fe80:0000:0000:0000:e4ea:71ff:feb9:4970                 v2      net1
```

This patch fixes two problems in the existing code when dealing with
such a table:

1) The GID index loop can terminate prematurely, because testing an
   invalid GID while searching for the GID returns an error.
2) An invalid GID could be selected (by comparing only address
   families); then, on subsequent tests, the RoCE call would fail,
   resulting in a valid GID being skipped.
2025-01-22 13:53:43 -08:00
2 changed files with 13 additions and 8 deletions

View File

@ -38,10 +38,13 @@ Thanks for your understanding and for contributing to NCCL.`;
// Ignore PRs
if (issue.pull_request) continue;
// Ignore issues with label "ongoing"
if (issue.labels.some(label => label.name === "ongoing")) continue;
const createdAt = new Date(issue.created_at);
const updatedAt = new Date(issue.updated_at);
if (createdAt < sixMonthsAgo && updatedAt < oneMonthAgo) {
if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) {
// Add a comment before closing
await octokit.issues.createComment({

View File

@ -382,15 +382,13 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port
bool gidCandidateMatchSubnet = matchGidAddrPrefix(usrFam, prefix, prefixlen, &gidCandidate);
if (gidCandidateFam != gidFam && gidCandidateFam == usrFam && gidCandidateMatchSubnet) {
*gidIndex = gidIndexCandidate;
} else {
if (gidCandidateFam != usrFam || !validGid(&gidCandidate) || !gidCandidateMatchSubnet) {
return ncclSuccess;
}
int usrRoceVer = roceVer;
int gidRoceVerNum, gidRoceVerNumCandidate = -1;
const char* deviceName = wrap_ibv_get_device_name(context->device);
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
if (validGid(&gid)) {
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum));
}
NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate));
if ((gidRoceVerNum != gidRoceVerNumCandidate || !validGid(&gid)) && gidRoceVerNumCandidate == usrRoceVer) {
*gidIndex = gidIndexCandidate;
@ -444,9 +442,13 @@ static ncclResult_t ncclIbGetGidIndex(struct ibv_context *context, uint8_t portN
*gidIndex = 0;
for (int gidIndexNext = 1; gidIndexNext < gidTblLen; ++gidIndexNext) {
NCCLCHECK(ncclUpdateGidIndex(context, portNum, userAddrFamily, prefix, prefixlen, userRoceVersion, gidIndexNext, gidIndex));
// It is ok for this to return non-success. GID assignment is fully handled in the function
// We do not want to short-circuit this loop prematurely in the case of a GID table not starting at 1
ncclUpdateGidIndex(context, portNum, userAddrFamily, prefix, prefixlen, userRoceVersion, gidIndexNext, gidIndex);
}
INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using GID Index %d", *gidIndex);
return ncclSuccess;
}