mirror of https://github.com/NVIDIA/nccl.git
319 lines
14 KiB
C++
319 lines
14 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "ibvwrap.h"
|
|
#include <sys/types.h>
|
|
#include <unistd.h>
|
|
|
|
#ifdef NCCL_BUILD_RDMA_CORE
|
|
#include <infiniband/verbs.h>
|
|
#else
|
|
#include "ibvcore.h"
|
|
#endif
|
|
#include "ibvsymbols.h"
|
|
|
|
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
|
static ncclResult_t initResult;
|
|
struct ncclIbvSymbols ibvSymbols;
|
|
|
|
ncclResult_t wrap_ibv_symbols(void) {
|
|
pthread_once(&initOnceControl,
|
|
[](){ initResult = buildIbvSymbols(&ibvSymbols); });
|
|
return initResult;
|
|
}
|
|
|
|
/* CHECK_NOT_NULL: helper macro to check for NULL symbol */
|
|
#define CHECK_NOT_NULL(container, internal_name) \
|
|
if (container.internal_name == NULL) { \
|
|
WARN("lib wrapper not initialized."); \
|
|
return ncclInternalError; \
|
|
}
|
|
|
|
#define IBV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \
|
|
CHECK_NOT_NULL(container, internal_name); \
|
|
retval = container.call; \
|
|
if (retval == error_retval) { \
|
|
WARN("Call to " name " failed with error %s", strerror(errno)); \
|
|
return ncclSystemError; \
|
|
} \
|
|
return ncclSuccess;
|
|
|
|
#define IBV_PTR_CHECK(container, internal_name, call, retval, error_retval, name) \
|
|
CHECK_NOT_NULL(container, internal_name); \
|
|
retval = container.call; \
|
|
if (retval == error_retval) { \
|
|
WARN("Call to " name " failed"); \
|
|
return ncclSystemError; \
|
|
} \
|
|
return ncclSuccess;
|
|
|
|
#define IBV_INT_CHECK_RET_ERRNO_OPTIONAL(container, internal_name, call, success_retval, name, supported) \
|
|
if (container.internal_name == NULL) { \
|
|
INFO(NCCL_NET, "Call to " name " skipped, internal_name doesn't exist"); \
|
|
*supported = 0; \
|
|
return ncclSuccess; \
|
|
} \
|
|
int ret = container.call; \
|
|
if (ret == ENOTSUP || ret == EOPNOTSUPP) { \
|
|
INFO(NCCL_NET, "Call to " name " not supported"); \
|
|
*supported = 0; \
|
|
return ncclSuccess; \
|
|
} else if (ret != success_retval) { \
|
|
WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
|
|
*supported = 1; \
|
|
return ncclSystemError; \
|
|
} \
|
|
*supported = 1; \
|
|
return ncclSuccess;
|
|
|
|
#define IBV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \
|
|
CHECK_NOT_NULL(container, internal_name); \
|
|
int ret = container.call; \
|
|
if (ret != success_retval) { \
|
|
WARN("Call to " name " failed with error %s errno %d", strerror(ret), ret); \
|
|
return ncclSystemError; \
|
|
} \
|
|
return ncclSuccess;
|
|
|
|
#define IBV_INT_CHECK(container, internal_name, call, error_retval, name) \
|
|
CHECK_NOT_NULL(container, internal_name); \
|
|
int ret = container.call; \
|
|
if (ret == error_retval) { \
|
|
WARN("Call to " name " failed"); \
|
|
return ncclSystemError; \
|
|
} \
|
|
return ncclSuccess;
|
|
|
|
#define IBV_PASSTHRU(container, internal_name, call) \
|
|
CHECK_NOT_NULL(container, internal_name); \
|
|
container.call; \
|
|
return ncclSuccess;
|
|
|
|
NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0);
|
|
NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34);
|
|
NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds
|
|
|
|
#define IBV_ERR_EQ(e, code) (e == code || e == (-code))
|
|
#define IBV_MQP_RETRY_ERRNO(e) (IBV_ERR_EQ(e, ETIMEDOUT))
|
|
#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e))
|
|
|
|
ncclResult_t wrap_ibv_fork_init() {
|
|
IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
|
|
*ret = ibvSymbols.ibv_internal_get_device_list(num_devices);
|
|
if (*ret == NULL) *num_devices = 0;
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
|
|
IBV_PASSTHRU(ibvSymbols, ibv_internal_free_device_list, ibv_internal_free_device_list(list));
|
|
}
|
|
|
|
const char *wrap_ibv_get_device_name(struct ibv_device *device) {
|
|
if (ibvSymbols.ibv_internal_get_device_name == NULL) {
|
|
WARN("lib wrapper not initialized.");
|
|
exit(-1);
|
|
}
|
|
return ibvSymbols.ibv_internal_get_device_name(device);
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
|
|
IBV_PTR_CHECK(ibvSymbols, ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
|
|
IBV_INT_CHECK(ibvSymbols, ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
|
|
IBV_INT_CHECK(ibvSymbols, ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
|
|
IBV_PASSTHRU(ibvSymbols, ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
|
|
// First try and query the extended port attributes (e.g. active_speed_ex)
|
|
if (ibv_query_port_ex(context, port_num, port_attr) != 0) {
|
|
// Fall back to the original attribute API call, but zero all members first
|
|
memset(port_attr, 0, sizeof(*port_attr));
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
|
|
}
|
|
|
|
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
|
|
if (ibvSymbols.ibv_internal_reg_mr == NULL) {
|
|
WARN("lib wrapper not initialized.");
|
|
return NULL;
|
|
}
|
|
return ibvSymbols.ibv_internal_reg_mr(pd, addr, length, access);
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
|
|
if (ibvSymbols.ibv_internal_reg_mr_iova2 == NULL) {
|
|
return ncclInternalError;
|
|
}
|
|
if (ret == NULL) { return ncclSuccess; } // Assume dummy call
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
|
|
}
|
|
|
|
/* DMA-BUF support */
|
|
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_reg_dmabuf_mr, ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access), *ret, NULL, "ibv_reg_dmabuf_mr");
|
|
}
|
|
|
|
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access) {
|
|
if (ibvSymbols.ibv_internal_reg_dmabuf_mr == NULL) {
|
|
errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set
|
|
return NULL;
|
|
}
|
|
return ibvSymbols.ibv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access);
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
|
|
IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
|
|
IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
|
|
}
|
|
|
|
static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) {
|
|
switch (state) {
|
|
case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break;
|
|
case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break;
|
|
case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break;
|
|
case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break;
|
|
case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break;
|
|
case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break;
|
|
case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break;
|
|
case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break;
|
|
default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break;
|
|
}
|
|
}
|
|
|
|
#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? (userAttr) : (attr))
|
|
|
|
static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) {
|
|
ncclResult_t res;
|
|
int portNum = -1, gidIndex = -1;
|
|
char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN];
|
|
const char *localGidRes = NULL, *remoteGidRes = NULL;
|
|
|
|
char nextState[32], currState[32];
|
|
ibvQpStateName(qp->state, currState, sizeof(currState));
|
|
ibvQpStateName(qpState, nextState, sizeof(nextState));
|
|
char devName[IBV_SYSFS_NAME_MAX] = "";
|
|
snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A");
|
|
|
|
struct ibv_qp_attr attr;
|
|
struct ibv_qp_init_attr init_attr;
|
|
int attr_mask = IBV_QP_PORT | IBV_QP_AV;
|
|
res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr);
|
|
struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL;
|
|
|
|
// port info, portAttr can be NULL if not given by the user and query_qp failed
|
|
struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT);
|
|
portNum = portAttr ? portAttr->port_num : -1;
|
|
|
|
// address info, avAttr can be NULL if not given by the user and query_qp failed
|
|
struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV);
|
|
if (avAttr && avAttr->ah_attr.is_global) {
|
|
union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid;
|
|
remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName));
|
|
// we need pd->context to retrieve local GID, skip if not there
|
|
if (!qp->pd->context) goto print;
|
|
gidIndex = avAttr->ah_attr.grh.sgid_index;
|
|
union ibv_gid localGid;
|
|
NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print);
|
|
localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName));
|
|
}
|
|
|
|
print:
|
|
snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s",
|
|
devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? remoteGidName : "N/A");
|
|
return;
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) {
|
|
char qpMsg[1024];
|
|
int ret = 0, attempts = 0;
|
|
int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1
|
|
int timeOut = (int)ncclParamIbMQpRetryTimeout();
|
|
CHECK_NOT_NULL(ibvSymbols, ibv_internal_modify_qp);
|
|
do {
|
|
if (attempts > 0) {
|
|
unsigned int sleepTime = timeOut * attempts;
|
|
ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
|
|
INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime);
|
|
// sleep before retrying
|
|
struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)};
|
|
nanosleep(&tv, NULL);
|
|
}
|
|
ret = ibvSymbols.ibv_internal_modify_qp(qp, attr, attr_mask);
|
|
attempts++;
|
|
} while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt);
|
|
if (ret != 0) {
|
|
ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg));
|
|
WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg);
|
|
return ncclSystemError;
|
|
}
|
|
return ncclSuccess;
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
|
IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_query_ece, ibv_internal_query_ece(qp, ece), 0, "ibv_query_ece", supported);
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
|
|
IBV_INT_CHECK_RET_ERRNO_OPTIONAL(ibvSymbols, ibv_internal_set_ece, ibv_internal_set_ece(qp, ece), 0, "ibv_set_ece", supported);
|
|
}
|
|
|
|
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
|
|
*ret = (char *) ibvSymbols.ibv_internal_event_type_str(event);
|
|
return ncclSuccess;
|
|
}
|