mirror of https://github.com/Jittor/Jittor
422 lines
11 KiB
C++
422 lines
11 KiB
C++
/**
|
|
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
* with this source code for terms and conditions that govern your use of
|
|
* this software. Any use, reproduction, disclosure, or distribution of
|
|
* this software and related documentation outside the terms of the EULA
|
|
* is strictly prohibited.
|
|
*
|
|
*/
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// These are CUDA Helper functions for initialization and error checking
|
|
|
|
#ifndef COMMON_HELPER_CUDA_H_
|
|
#define COMMON_HELPER_CUDA_H_
|
|
|
|
#pragma once
|
|
|
|
#include "utils/log.h"
|
|
|
|
#include <stdexcept>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include <helper_string.h>
|
|
|
|
#ifndef EXIT_WAIVED
|
|
#define EXIT_WAIVED 2
|
|
#endif
|
|
|
|
// Note: your SDK sample is required to include the proper header
// files. Please refer to the CUDA examples for the needed CUDA
// headers, which may change depending on which CUDA functions are used.
|
|
|
|
// CUDA Runtime error messages
|
|
#ifdef __DRIVER_TYPES_H__
|
|
// Returns the symbolic name of a CUDA Runtime error code by forwarding the
// code to cudaGetErrorName().
inline const char *_cudaGetErrorEnum(cudaError_t error) {
  const char *const errorName = cudaGetErrorName(error);
  return errorName;
}
|
|
#endif
|
|
|
|
// CUDA Driver API errors
|
|
#ifdef CUDA_DRIVER_API
|
|
// Returns the symbolic name of a CUDA Driver API error code as reported by
// cuGetErrorName(), or the literal "<unknown>" when that call leaves the
// output pointer NULL.
inline const char *_cudaGetErrorEnum(CUresult error) {
  const char *errorName = NULL;
  cuGetErrorName(error, &errorName);

  if (errorName == NULL) {
    return "<unknown>";
  }
  return errorName;
}
|
|
#endif
|
|
|
|
#ifdef CUBLAS_API_H_
|
|
// cuBLAS API errors
|
|
const char *_cudaGetErrorEnum(cublasStatus_t error);
|
|
#endif
|
|
|
|
#ifdef CUDNN_H_
|
|
// cudnn API errors
|
|
const char *_cudaGetErrorEnum(cudnnStatus_t error);
|
|
#endif
|
|
|
|
#ifdef _CUFFT_H_
|
|
// cuFFT API errors
|
|
const char *_cudaGetErrorEnum(cufftResult error);
|
|
#endif
|
|
|
|
#ifdef CUSPARSEAPI
|
|
// cuSPARSE API errors
|
|
const char *_cudaGetErrorEnum(cusparseStatus_t error);
|
|
#endif
|
|
|
|
#ifdef CUSOLVER_COMMON_H_
|
|
// cuSOLVER API errors
|
|
const char *_cudaGetErrorEnum(cusolverStatus_t error);
|
|
#endif
|
|
|
|
#ifdef CURAND_H_
|
|
// cuRAND API errors
|
|
const char *_cudaGetErrorEnum(curandStatus_t error);
|
|
#endif
|
|
|
|
#ifdef NCCL_H_
|
|
// NCCL API errors
|
|
const char *_cudaGetErrorEnum(ncclResult_t error);
|
|
#endif
|
|
|
|
#ifdef NV_NPPIDEFS_H
|
|
// NPP API errors
|
|
const char *_cudaGetErrorEnum(NppStatus error);
|
|
#endif
|
|
|
|
#ifdef __DRIVER_TYPES_H__
|
|
#ifndef DEVICE_RESET
|
|
#define DEVICE_RESET cudaDeviceReset();
|
|
#endif
|
|
#else
|
|
#ifndef DEVICE_RESET
|
|
#define DEVICE_RESET
|
|
#endif
|
|
#endif
|
|
|
|
template <typename T>
|
|
void check(T result, char const *const func, const char *const file,
|
|
int const line) {
|
|
if (result) {
|
|
DEVICE_RESET
|
|
LOGf << "CUDA error at" << file >> ":" >> line << " code="
|
|
>> static_cast<unsigned int>(result) >> "(" << _cudaGetErrorEnum(result) << ")"
|
|
<< func;
|
|
}
|
|
}
|
|
|
|
#ifdef __DRIVER_TYPES_H__
|
|
// This will output the proper CUDA error strings in the event
|
|
// that a CUDA host call returns an error
|
|
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
|
|
|
|
// This will output the proper error string when calling cudaGetLastError
|
|
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
|
|
|
|
inline void __getLastCudaError(const char *errorMessage, const char *file,
|
|
const int line) {
|
|
cudaError_t err = cudaGetLastError();
|
|
|
|
if (cudaSuccess != err) {
|
|
DEVICE_RESET
|
|
LOGf << "CUDA error at" << file >> ":" >> line << " code="
|
|
>> static_cast<unsigned int>(err) >> "(" << _cudaGetErrorEnum(err) << ")"
|
|
<< errorMessage;
|
|
}
|
|
}
|
|
|
|
// This will only print the proper error string when calling cudaGetLastError
|
|
// but not exit program incase error detected.
|
|
#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
|
|
|
|
inline void __printLastCudaError(const char *errorMessage, const char *file,
|
|
const int line) {
|
|
cudaError_t err = cudaGetLastError();
|
|
|
|
if (cudaSuccess != err) {
|
|
DEVICE_RESET
|
|
LOGf << "CUDA error at" << file >> ":" >> line << " code="
|
|
>> static_cast<unsigned int>(err) >> "(" << _cudaGetErrorEnum(err) << ")"
|
|
<< errorMessage;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#ifndef MAX
// Evaluates to the larger of a and b. Both arguments and the whole expansion
// are parenthesized so that operator expressions passed as arguments keep
// their intended precedence. Each argument may still be evaluated twice, so
// avoid arguments with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
|
|
|
|
// Float to int conversion: rounds to the nearest integer by adding 0.5 to
// non-negative inputs (subtracting 0.5 from negative ones) and truncating,
// so halfway cases move away from zero.
inline int ftoi(float value) {
  if (value >= 0) {
    return static_cast<int>(value + 0.5);
  }
  return static_cast<int>(value - 0.5);
}
|
|
|
|
// Beginning of GPU Architecture definitions

// Maps an SM (compute capability) version to the number of cores per
// multiprocessor.
//   major, minor - compute capability, e.g. 7 and 5 for SM 7.5.
// Returns the core count of the matching table entry. When the version is not
// in the table, a notice is printed to stdout and the core count of the last
// table entry is returned so callers can keep running.
inline int _ConvertSMVer2Cores(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine
  // the # of cores per SM)
  typedef struct {
    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
             // and m = SM minor version
    int Cores;
  } sSMtoCores;

  // The table never changes, so build it once instead of on every call.
  // The {-1, -1} entry terminates the list.
  static const sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128},
      {0x52, 128},
      {0x53, 128},
      {0x60, 64},
      {0x61, 128},
      {0x62, 128},
      {0x70, 64},
      {0x72, 64},
      {0x75, 64},
      {0x80, 64},
      {0x86, 128},
      {0x87, 128},
      {0x89, 128},
      {0x90, 128},
      {-1, -1}};

  const int smVersion = (major << 4) + minor;

  int index = 0;
  for (; nGpuArchCoresPerSM[index].SM != -1; ++index) {
    if (nGpuArchCoresPerSM[index].SM == smVersion) {
      return nGpuArchCoresPerSM[index].Cores;
    }
  }

  // If we don't find the value, we default to the last real entry (the one
  // just before the terminator) so that callers can run properly.
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      "  Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}
// end of GPU Architecture definitions
|
|
|
|
#ifdef __CUDA_RUNTIME_H__
|
|
// General GPU Device CUDA Initialization
|
|
inline int gpuDeviceInit(int devID) {
|
|
int device_count;
|
|
checkCudaErrors(cudaGetDeviceCount(&device_count));
|
|
|
|
if (device_count == 0) {
|
|
fprintf(stderr,
|
|
"gpuDeviceInit() CUDA error: "
|
|
"no devices supporting CUDA.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if (devID < 0) {
|
|
devID = 0;
|
|
}
|
|
|
|
if (devID > device_count - 1) {
|
|
fprintf(stderr, "\n");
|
|
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
|
|
device_count);
|
|
fprintf(stderr,
|
|
">> gpuDeviceInit (-device=%d) is not a valid"
|
|
" GPU device. <<\n",
|
|
devID);
|
|
fprintf(stderr, "\n");
|
|
return -devID;
|
|
}
|
|
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
|
|
|
|
if (deviceProp.computeMode == cudaComputeModeProhibited) {
|
|
fprintf(stderr,
|
|
"Error: device is running in <Compute Mode "
|
|
"Prohibited>, no threads can use cudaSetDevice().\n");
|
|
return -1;
|
|
}
|
|
|
|
if (deviceProp.major < 1) {
|
|
fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
checkCudaErrors(cudaSetDevice(devID));
|
|
printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
|
|
|
|
return devID;
|
|
}
|
|
|
|
// This function returns the best GPU (with maximum GFLOPS)
|
|
inline int gpuGetMaxGflopsDeviceId() {
|
|
int current_device = 0, sm_per_multiproc = 0;
|
|
int max_perf_device = 0;
|
|
int device_count = 0;
|
|
int devices_prohibited = 0;
|
|
|
|
uint64_t max_compute_perf = 0;
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceCount(&device_count));
|
|
|
|
if (device_count == 0) {
|
|
fprintf(stderr,
|
|
"gpuGetMaxGflopsDeviceId() CUDA error:"
|
|
" no devices supporting CUDA.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// Find the best CUDA capable GPU device
|
|
current_device = 0;
|
|
|
|
while (current_device < device_count) {
|
|
cudaGetDeviceProperties(&deviceProp, current_device);
|
|
|
|
// If this GPU is not running on Compute Mode prohibited,
|
|
// then we can add it to the list
|
|
if (deviceProp.computeMode != cudaComputeModeProhibited) {
|
|
if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
|
|
sm_per_multiproc = 1;
|
|
} else {
|
|
sm_per_multiproc =
|
|
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
|
|
}
|
|
|
|
uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount *
|
|
sm_per_multiproc * deviceProp.clockRate;
|
|
|
|
if (compute_perf > max_compute_perf) {
|
|
max_compute_perf = compute_perf;
|
|
max_perf_device = current_device;
|
|
}
|
|
} else {
|
|
devices_prohibited++;
|
|
}
|
|
|
|
++current_device;
|
|
}
|
|
|
|
if (devices_prohibited == device_count) {
|
|
fprintf(stderr,
|
|
"gpuGetMaxGflopsDeviceId() CUDA error:"
|
|
" all devices have compute mode prohibited.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return max_perf_device;
|
|
}
|
|
|
|
// Initialization code to find the best CUDA Device
|
|
inline int findCudaDevice(int argc, const char **argv) {
|
|
cudaDeviceProp deviceProp;
|
|
int devID = 0;
|
|
|
|
// If the command-line has a device number specified, use it
|
|
if (checkCmdLineFlag(argc, argv, "device")) {
|
|
devID = getCmdLineArgumentInt(argc, argv, "device=");
|
|
|
|
if (devID < 0) {
|
|
printf("Invalid command line parameter\n ");
|
|
exit(EXIT_FAILURE);
|
|
} else {
|
|
devID = gpuDeviceInit(devID);
|
|
|
|
if (devID < 0) {
|
|
printf("exiting...\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
} else {
|
|
// Otherwise pick the device with highest Gflops/s
|
|
devID = gpuGetMaxGflopsDeviceId();
|
|
checkCudaErrors(cudaSetDevice(devID));
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
|
|
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
|
|
deviceProp.name, deviceProp.major, deviceProp.minor);
|
|
}
|
|
|
|
return devID;
|
|
}
|
|
|
|
inline int findIntegratedGPU() {
|
|
int current_device = 0;
|
|
int device_count = 0;
|
|
int devices_prohibited = 0;
|
|
|
|
cudaDeviceProp deviceProp;
|
|
checkCudaErrors(cudaGetDeviceCount(&device_count));
|
|
|
|
if (device_count == 0) {
|
|
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
// Find the integrated GPU which is compute capable
|
|
while (current_device < device_count) {
|
|
cudaGetDeviceProperties(&deviceProp, current_device);
|
|
|
|
// If GPU is integrated and is not running on Compute Mode prohibited,
|
|
// then cuda can map to GLES resource
|
|
if (deviceProp.integrated &&
|
|
(deviceProp.computeMode != cudaComputeModeProhibited)) {
|
|
checkCudaErrors(cudaSetDevice(current_device));
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
|
|
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
|
|
current_device, deviceProp.name, deviceProp.major,
|
|
deviceProp.minor);
|
|
|
|
return current_device;
|
|
} else {
|
|
devices_prohibited++;
|
|
}
|
|
|
|
current_device++;
|
|
}
|
|
|
|
if (devices_prohibited == device_count) {
|
|
fprintf(stderr,
|
|
"CUDA error:"
|
|
" No GLES-CUDA Interop capable GPU found.\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
// General check for CUDA GPU SM Capabilities
|
|
inline bool checkCudaCapabilities(int major_version, int minor_version) {
|
|
cudaDeviceProp deviceProp;
|
|
deviceProp.major = 0;
|
|
deviceProp.minor = 0;
|
|
int dev;
|
|
|
|
checkCudaErrors(cudaGetDevice(&dev));
|
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
|
|
|
|
if ((deviceProp.major > major_version) ||
|
|
(deviceProp.major == major_version &&
|
|
deviceProp.minor >= minor_version)) {
|
|
printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
|
|
deviceProp.name, deviceProp.major, deviceProp.minor);
|
|
return true;
|
|
} else {
|
|
printf(
|
|
" No GPU device was found that can support "
|
|
"CUDA compute capability %d.%d.\n",
|
|
major_version, minor_version);
|
|
return false;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// end of CUDA Helper Functions
|
|
|
|
#endif // COMMON_HELPER_CUDA_H_
|