// JittorMirror/extern/cuda/inc/helper_cuda.h

/**
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions for initialization and error checking
#ifndef COMMON_HELPER_CUDA_H_
#define COMMON_HELPER_CUDA_H_
#pragma once
#include "utils/log.h"
#include <stdexcept>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <helper_string.h>
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
// Note: your SDK sample is required to include the proper header files;
// please refer to the CUDA samples for examples of the needed CUDA
// headers, which may change depending on which CUDA functions are used.
// CUDA Runtime error messages
#ifdef __DRIVER_TYPES_H__
inline const char *_cudaGetErrorEnum(cudaError_t error) {
  return cudaGetErrorName(error);
}
#endif
// CUDA Driver API errors
#ifdef CUDA_DRIVER_API
inline const char *_cudaGetErrorEnum(CUresult error) {
  const char *ret = NULL;
  cuGetErrorName(error, &ret);
  return ret ? ret : "<unknown>";
}
#endif
#ifdef CUBLAS_API_H_
// cuBLAS API errors
const char *_cudaGetErrorEnum(cublasStatus_t error);
#endif
#ifdef CUDNN_H_
// cudnn API errors
const char *_cudaGetErrorEnum(cudnnStatus_t error);
#endif
#ifdef _CUFFT_H_
// cuFFT API errors
const char *_cudaGetErrorEnum(cufftResult error);
#endif
#ifdef CUSPARSEAPI
// cuSPARSE API errors
const char *_cudaGetErrorEnum(cusparseStatus_t error);
#endif
#ifdef CUSOLVER_COMMON_H_
// cuSOLVER API errors
const char *_cudaGetErrorEnum(cusolverStatus_t error);
#endif
#ifdef CURAND_H_
// cuRAND API errors
const char *_cudaGetErrorEnum(curandStatus_t error);
#endif
#ifdef NCCL_H_
// NCCL API errors
const char *_cudaGetErrorEnum(ncclResult_t error);
#endif
#ifdef NV_NPPIDEFS_H
// NPP API errors
const char *_cudaGetErrorEnum(NppStatus error);
#endif
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (result) {
    DEVICE_RESET
    LOGf << "CUDA error at" << file >> ":" >> line << " code="
         >> static_cast<unsigned int>(result) >> "(" << _cudaGetErrorEnum(result) << ")"
         << func;
  }
}
#ifdef __DRIVER_TYPES_H__
// This will output the proper CUDA error strings in the event
// that a CUDA host call returns an error
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
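// Usage sketch (illustrative, not part of the original header): wrap each
// CUDA runtime call so a failure aborts with file/line context. The buffer
// name d_buf below is a hypothetical example.
//
//   float *d_buf = nullptr;
//   checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_buf), 1024 * sizeof(float)));
//   checkCudaErrors(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
//   checkCudaErrors(cudaFree(d_buf));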
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __getLastCudaError(const char *errorMessage, const char *file,
                               const int line) {
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    DEVICE_RESET
    LOGf << "CUDA error at" << file >> ":" >> line << " code="
         >> static_cast<unsigned int>(err) >> "(" << _cudaGetErrorEnum(err) << ")"
         << errorMessage;
  }
}
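// Usage sketch (illustrative): kernel launches do not return a cudaError_t,
// so check them via cudaGetLastError() right after the launch. The kernel
// name and launch configuration below are hypothetical.
//
//   myKernel<<<grid_dim, block_dim>>>(d_in, d_out);
//   getLastCudaError("myKernel launch failed");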
// This will only print the proper error string when calling cudaGetLastError
// but will not exit the program in case an error is detected.
#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
inline void __printLastCudaError(const char *errorMessage, const char *file,
                                 const int line) {
  cudaError_t err = cudaGetLastError();
  if (cudaSuccess != err) {
    DEVICE_RESET
    LOGf << "CUDA error at" << file >> ":" >> line << " code="
         >> static_cast<unsigned int>(err) >> "(" << _cudaGetErrorEnum(err) << ")"
         << errorMessage;
  }
}
#endif
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
// Float To Int conversion
inline int ftoi(float value) {
  return (value >= 0 ? static_cast<int>(value + 0.5)
                     : static_cast<int>(value - 0.5));
}
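// Example values (half-away-from-zero rounding):
//   ftoi(2.4f)  == 2,   ftoi(2.5f)  == 3
//   ftoi(-2.4f) == -2,  ftoi(-2.5f) == -3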
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor) {
  // Defines for GPU Architecture types (using the SM version to determine
  // the # of cores per SM)
  typedef struct {
    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
             // and m = SM minor version
    int Cores;
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128},
      {0x52, 128},
      {0x53, 128},
      {0x60, 64},
      {0x61, 128},
      {0x62, 128},
      {0x70, 64},
      {0x72, 64},
      {0x75, 64},
      {-1, -1}};

  int index = 0;
  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
      return nGpuArchCoresPerSM[index].Cores;
    }
    index++;
  }

  // If we don't find the values, default to the last known architecture
  // so the sample can still run.
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      " Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}
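// Usage sketch (illustrative): combine the cores-per-SM lookup with the
// multiprocessor count to estimate the total number of CUDA cores. Device 0
// is used here only as an example.
//
//   cudaDeviceProp prop;
//   checkCudaErrors(cudaGetDeviceProperties(&prop, 0));
//   int cuda_cores =
//       _ConvertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount;
//   // e.g. an SM 7.5 (Turing) device maps to 64 cores per SM in the table above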
// end of GPU Architecture definitions
#ifdef __CUDA_RUNTIME_H__
// General GPU Device CUDA Initialization
inline int gpuDeviceInit(int devID) {
  int device_count;
  checkCudaErrors(cudaGetDeviceCount(&device_count));

  if (device_count == 0) {
    fprintf(stderr,
            "gpuDeviceInit() CUDA error: "
            "no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  if (devID < 0) {
    devID = 0;
  }

  if (devID > device_count - 1) {
    fprintf(stderr, "\n");
    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
            device_count);
    fprintf(stderr,
            ">> gpuDeviceInit (-device=%d) is not a valid"
            " GPU device. <<\n",
            devID);
    fprintf(stderr, "\n");
    return -devID;
  }

  cudaDeviceProp deviceProp;
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

  if (deviceProp.computeMode == cudaComputeModeProhibited) {
    fprintf(stderr,
            "Error: device is running in <Compute Mode "
            "Prohibited>, no threads can use cudaSetDevice().\n");
    return -1;
  }

  if (deviceProp.major < 1) {
    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
    exit(EXIT_FAILURE);
  }

  checkCudaErrors(cudaSetDevice(devID));
  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, deviceProp.name);

  return devID;
}
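// Usage sketch (illustrative): explicitly select a device by index and bail
// out if it is out of range or prohibited. The index 1 is arbitrary.
//
//   int dev = gpuDeviceInit(1);
//   if (dev < 0) {
//     fprintf(stderr, "exiting...\n");
//     exit(EXIT_FAILURE);
//   }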
// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId() {
  int current_device = 0, sm_per_multiproc = 0;
  int max_perf_device = 0;
  int device_count = 0;
  int devices_prohibited = 0;
  uint64_t max_compute_perf = 0;
  cudaDeviceProp deviceProp;

  checkCudaErrors(cudaGetDeviceCount(&device_count));
  if (device_count == 0) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceId() CUDA error:"
            " no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  // Find the best CUDA capable GPU device
  current_device = 0;
  while (current_device < device_count) {
    cudaGetDeviceProperties(&deviceProp, current_device);

    // If this GPU is not running on Compute Mode prohibited,
    // then we can add it to the list
    if (deviceProp.computeMode != cudaComputeModeProhibited) {
      if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
        sm_per_multiproc = 1;
      } else {
        sm_per_multiproc =
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
      }

      uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount *
                              sm_per_multiproc * deviceProp.clockRate;

      if (compute_perf > max_compute_perf) {
        max_compute_perf = compute_perf;
        max_perf_device = current_device;
      }
    } else {
      devices_prohibited++;
    }

    ++current_device;
  }

  if (devices_prohibited == device_count) {
    fprintf(stderr,
            "gpuGetMaxGflopsDeviceId() CUDA error:"
            " all devices have compute mode prohibited.\n");
    exit(EXIT_FAILURE);
  }

  return max_perf_device;
}
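// Usage sketch (illustrative): the "max GFLOPS" score above is only a proxy,
// computed as multiProcessorCount * coresPerSM * clockRate, so it ranks
// devices by raw core throughput rather than measured performance.
//
//   int best = gpuGetMaxGflopsDeviceId();
//   checkCudaErrors(cudaSetDevice(best));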
// Initialization code to find the best CUDA Device
inline int findCudaDevice(int argc, const char **argv) {
  cudaDeviceProp deviceProp;
  int devID = 0;

  // If the command-line has a device number specified, use it
  if (checkCmdLineFlag(argc, argv, "device")) {
    devID = getCmdLineArgumentInt(argc, argv, "device=");
    if (devID < 0) {
      printf("Invalid command line parameter\n ");
      exit(EXIT_FAILURE);
    } else {
      devID = gpuDeviceInit(devID);
      if (devID < 0) {
        printf("exiting...\n");
        exit(EXIT_FAILURE);
      }
    }
  } else {
    // Otherwise pick the device with highest Gflops/s
    devID = gpuGetMaxGflopsDeviceId();
    checkCudaErrors(cudaSetDevice(devID));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
           deviceProp.name, deviceProp.major, deviceProp.minor);
  }

  return devID;
}
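// Usage sketch (illustrative): forward the program's command line so that
// "-device=N" selects a specific GPU; otherwise the fastest device is chosen.
//
//   int main(int argc, char **argv) {
//     int devID = findCudaDevice(argc, (const char **)argv);
//     ...
//   }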
inline int findIntegratedGPU() {
  int current_device = 0;
  int device_count = 0;
  int devices_prohibited = 0;
  cudaDeviceProp deviceProp;

  checkCudaErrors(cudaGetDeviceCount(&device_count));
  if (device_count == 0) {
    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }

  // Find the integrated GPU which is compute capable
  while (current_device < device_count) {
    cudaGetDeviceProperties(&deviceProp, current_device);

    // If GPU is integrated and is not running on Compute Mode prohibited,
    // then cuda can map to GLES resource
    if (deviceProp.integrated &&
        (deviceProp.computeMode != cudaComputeModeProhibited)) {
      checkCudaErrors(cudaSetDevice(current_device));
      checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
             current_device, deviceProp.name, deviceProp.major,
             deviceProp.minor);
      return current_device;
    } else {
      devices_prohibited++;
    }

    current_device++;
  }

  if (devices_prohibited == device_count) {
    fprintf(stderr,
            "CUDA error:"
            " No GLES-CUDA Interop capable GPU found.\n");
    exit(EXIT_FAILURE);
  }

  return -1;
}
// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilities(int major_version, int minor_version) {
  cudaDeviceProp deviceProp;
  deviceProp.major = 0;
  deviceProp.minor = 0;
  int dev;

  checkCudaErrors(cudaGetDevice(&dev));
  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));

  if ((deviceProp.major > major_version) ||
      (deviceProp.major == major_version &&
       deviceProp.minor >= minor_version)) {
    printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
           deviceProp.name, deviceProp.major, deviceProp.minor);
    return true;
  } else {
    printf(
        " No GPU device was found that can support "
        "CUDA compute capability %d.%d.\n",
        major_version, minor_version);
    return false;
  }
}
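// Usage sketch (illustrative): gate a sample on a minimum compute capability
// and waive (rather than fail) when the GPU is too old; 3.5 here is an
// arbitrary example threshold.
//
//   if (!checkCudaCapabilities(3, 5)) {
//     exit(EXIT_WAIVED);
//   }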
#endif
// end of CUDA Helper Functions
#endif // COMMON_HELPER_CUDA_H_