mirror of https://github.com/NVIDIA/nccl.git
Merge b1c424d1a6
into 593de54e52
This commit is contained in:
commit
6ab8cc2635
|
@ -0,0 +1,24 @@
|
|||
# Top-level build description for the NCCL project.
cmake_minimum_required(VERSION 4.0)

project(
  nccl
  VERSION 2.27.7
  LANGUAGES CUDA CXX)

# Boolean build switches. Each switch uses its own name as the cache help
# string. Everything defaults to OFF except NVTX.
foreach(nccl_switch IN ITEMS VERBOSE KEEP TRACE PROFAPI)
  option(${nccl_switch} "${nccl_switch}" OFF)
endforeach()
option(NVTX "NVTX" ON)
option(NET_PROFILER "NET_PROFILER" OFF)

# Ask the generator to write compile_commands.json for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# The nccl and nccl_static targets installed below are expected to be created
# by the src subdirectory.
add_subdirectory(src)

# Install both libraries, put the public_headers file set under include/, and
# record the targets in the NCCLConfig export set.
install(
  TARGETS nccl nccl_static
  EXPORT NCCLConfig
  FILE_SET public_headers DESTINATION include)

# Write the export set to lib/cmake/nccl; imported targets get the NCCL::
# namespace prefix.
install(
  EXPORT NCCLConfig
  NAMESPACE NCCL::
  DESTINATION lib/cmake/nccl)
|
|
@ -0,0 +1,39 @@
|
|||
# nccl_add_target_options(<target>)
#
# Applies the compile options, preprocessor definitions and target properties
# shared by NCCL targets to <target>. All settings are PRIVATE.
#
# Reads these variables from the calling scope:
#   VERBOSE      - add extra warnings and verbose ptxas output
#   TRACE        - define ENABLE_TRACE
#   NVTX         - when false, define NVTX_DISABLE
#   KEEP         - pass -keep to the CUDA compiler
#   PROFAPI      - define PROFAPI
#   NET_PROFILER - define NET_PROFILER
#
# Example:
#   nccl_add_target_options(nccl)
function(nccl_add_target_options target)
  if(NOT TARGET "${target}")
    message(
      FATAL_ERROR
        "nccl_add_target_options: '${target}' is not an existing target")
  endif()

  # Host-compiler flags must not be handed to the CUDA compiler driver
  # directly, so they are gated on the source language.
  set(is_cuda "$<COMPILE_LANGUAGE:CUDA>")
  set(not_cuda "$<NOT:$<COMPILE_LANGUAGE:CUDA>>")

  # Generator expressions are quoted so they stay a single argument. Flag pairs
  # use the SHELL: prefix so option de-duplication cannot drop a repeated
  # -Xptxas / -Xfatbin / -Xcompiler and orphan its value.
  target_compile_options(
    ${target}
    PRIVATE "$<$<AND:${not_cuda},$<CONFIG:Debug>>:-ggdb3>"
            "$<$<NOT:$<CONFIG:Debug>>:-O3>"
            "$<${is_cuda}:--expt-extended-lambda>"
            "$<${is_cuda}:SHELL:-Xptxas -maxrregcount=96>"
            "$<${is_cuda}:SHELL:-Xfatbin -compress-all>"
            "$<${not_cuda}:-Wall;-Wno-unused-function;-Wno-sign-compare;-Wvla>")

  # POSITION_INDEPENDENT_CODE replaces the hand-written -fPIC and lets CMake
  # pick the right spelling for each language.
  set_target_properties(
    ${target}
    PROPERTIES CXX_STANDARD 17
               CUDA_STANDARD 17
               POSITION_INDEPENDENT_CODE ON
               CXX_VISIBILITY_PRESET hidden
               VISIBILITY_INLINES_HIDDEN 1
               CUDA_RESOLVE_DEVICE_SYMBOLS ON)

  if(VERBOSE)
    target_compile_options(
      ${target}
      PRIVATE "$<${is_cuda}:SHELL:-Xptxas -v>"
              "$<${is_cuda}:SHELL:-Xcompiler -Wall,-Wextra>"
              "$<${not_cuda}:-Wall;-Wextra>")
  endif()

  if(KEEP)
    target_compile_options(${target} PRIVATE "$<${is_cuda}:-keep>")
  endif()

  # These are preprocessor macros, so they go through
  # target_compile_definitions (emitted as -D...) rather than being passed to
  # the compiler as bare arguments.
  if(TRACE)
    target_compile_definitions(${target} PRIVATE ENABLE_TRACE)
  endif()

  if(NOT NVTX)
    target_compile_definitions(${target} PRIVATE NVTX_DISABLE)
  endif()

  if(PROFAPI)
    target_compile_definitions(${target} PRIVATE PROFAPI)
  endif()

  if(NET_PROFILER)
    # NOTE(review): confirm the sources test a macro literally named
    # NET_PROFILER; the name is carried over unchanged.
    target_compile_definitions(${target} PRIVATE NET_PROFILER)
  endif()
endfunction()
|
|
@ -0,0 +1,51 @@
|
|||
# Builds the shared (nccl) and static (nccl_static) NCCL libraries from the
# .cc sources in this directory tree.

# Expected to provide nccl_add_target_options(), which is called below.
include("${CMAKE_CURRENT_LIST_DIR}/../cmake/common.cmake")

find_package(CUDAToolkit REQUIRED)

# Values substituted into nccl.h.in by configure_file().
set(nccl_Major "${nccl_VERSION_MAJOR}")
set(nccl_Minor "${nccl_VERSION_MINOR}")
set(nccl_Patch "${nccl_VERSION_PATCH}")
set(nccl_Suffix "")
# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
math(EXPR nccl_Version
     "${nccl_Major} * 10000 + ${nccl_Minor} * 100 + ${nccl_Patch}")

# Directory holding the public header; also the base directory of the
# public_headers file set.
set(nccl_public_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/include")

# NOTE(review): this writes a generated file into the source tree. Moving it to
# the binary directory would be cleaner, but other directories may include
# nccl.h from this location - confirm before changing.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in"
               "${nccl_public_include_dir}/nccl.h")

# CONFIGURE_DEPENDS makes the build re-check the glob so added or removed
# sources are picked up without a manual re-configure.
file(
  GLOB
  SRC_FILES
  CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/*.cc"
  "${CMAKE_CURRENT_SOURCE_DIR}/misc/*.cc"
  "${CMAKE_CURRENT_SOURCE_DIR}/transport/*.cc"
  "${CMAKE_CURRENT_SOURCE_DIR}/collectives/*.cc"
  "${CMAKE_CURRENT_SOURCE_DIR}/graph/*.cc")

set(HEADER_FILES "${nccl_public_include_dir}/nccl.h")

set(NCCL_LIBS nccl;nccl_static)

add_library(nccl SHARED ${SRC_FILES})
add_library(nccl_static STATIC ${SRC_FILES})

foreach(lib_name IN LISTS NCCL_LIBS)
  nccl_add_target_options(${lib_name})

  target_include_directories(
    ${lib_name}
    PRIVATE "$<BUILD_INTERFACE:${nccl_public_include_dir}>"
            "$<BUILD_INTERFACE:${nccl_public_include_dir}/plugin>"
            "${CUDAToolkit_INCLUDE_DIRS}")

  # BASE_DIRS must be the directory that contains nccl.h. Files in a file set
  # are installed with their path relative to the base directory preserved, so
  # using the parent directory would install the header as
  # <dest>/include/nccl.h instead of <dest>/nccl.h.
  target_sources(
    ${lib_name}
    PUBLIC FILE_SET
           public_headers
           TYPE
           HEADERS
           BASE_DIRS
           "${nccl_public_include_dir}"
           FILES
           ${HEADER_FILES})
endforeach()
|
|
@ -0,0 +1,35 @@
|
|||
# Object library holding the device-side sources. The two files listed here
# are compiled once as they are.
add_library(colldevice OBJECT onerank_reduce.cu functions.cu)

# Element types and reduction operators. A value's position in its list is
# the number passed to the sources as NCCL_TYPE / NCCL_OP.
set(datatypes i8 u8 i32 u32 i64 u64 f16 f32 f64)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11")
  list(APPEND datatypes bf16)
endif()
set(ops sum prod min max premulsum sumpostdiv)

# For every (base, op, datatype) combination, copy <base>.cu into the build
# tree under a unique name and compile the copy with that combination's
# NCCL_OP / NCCL_TYPE indices.
foreach(base IN ITEMS sendrecv all_reduce all_gather broadcast reduce
                      reduce_scatter)
  set(op_index 0)
  foreach(op IN LISTS ops)
    set(dt_index 0)
    foreach(dt IN LISTS datatypes)
      set(generated_cu ${CMAKE_CURRENT_BINARY_DIR}/${base}_${op}_${dt}.cu)
      configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${base}.cu ${generated_cu}
                     COPYONLY)
      set_property(
        SOURCE ${generated_cu}
        PROPERTY COMPILE_DEFINITIONS NCCL_OP=${op_index} NCCL_TYPE=${dt_index})
      target_sources(colldevice PRIVATE ${generated_cu})
      math(EXPR dt_index "${dt_index} + 1")
    endforeach()
    math(EXPR op_index "${op_index} + 1")
  endforeach()
endforeach()

# The copies live in the build tree, so this directory is added to the include
# path alongside the include directory two levels up.
target_include_directories(
  colldevice PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include
                     ${CMAKE_CURRENT_SOURCE_DIR})

# Compile kernels and collectives with relocatable device code.
set_property(TARGET colldevice PROPERTY CUDA_SEPARABLE_COMPILATION ON)
|
|
@ -16,12 +16,12 @@
|
|||
#include <cuda_fp8.h>
|
||||
#endif
|
||||
|
||||
#define NCCL_MAJOR ${nccl:Major}
|
||||
#define NCCL_MINOR ${nccl:Minor}
|
||||
#define NCCL_PATCH ${nccl:Patch}
|
||||
#define NCCL_SUFFIX "${nccl:Suffix}"
|
||||
#define NCCL_MAJOR ${nccl_Major}
|
||||
#define NCCL_MINOR ${nccl_Minor}
|
||||
#define NCCL_PATCH ${nccl_Patch}
|
||||
#define NCCL_SUFFIX "${nccl_Suffix}"
|
||||
|
||||
#define NCCL_VERSION_CODE ${nccl:Version}
|
||||
#define NCCL_VERSION_CODE ${nccl_Version}
|
||||
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
Loading…
Reference in New Issue