add capter4
This commit is contained in:
parent
7e6d50d473
commit
b062aa39a2
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,93 @@
|
|||
# CUDA 程序的错误检测
|
||||
|
||||
------
|
||||
|
||||
## 检测 CUDA 运行时错误的宏函数
|
||||
|
||||
定义检查 cuda 运行时 API 返回值 `cudaError_t` 的宏函数。
|
||||
|
||||
#define CHECK(call) \
|
||||
do { \
|
||||
const cudaError_t error_code = call; \
|
||||
if (error_code != cudaSuccess) \
|
||||
{ \
|
||||
printf("CUDA ERROR: \n"); \
|
||||
printf(" FILE: %s\n", __FILE__); \
|
||||
printf(" LINE: %d\n", __LINE__); \
|
||||
printf(" ERROR CODE: %d\n", error_code); \
|
||||
printf(" ERROR TEXT: %s\n", cudaGetErrorString(error_code)); \
|
||||
exit(1); \
|
||||
} \
|
||||
}while(0);
|
||||
|
||||
因为核函数没有返回值,所以无法直接检查核函数错误。间接的方法是,在调用核函数后执行:
|
||||
|
||||
CHECK(cudaGetLastError()); // 捕捉同步前的最后一个错误。
|
||||
CHECK(cudaDeviceSynchronize()); // 同步主机和设备。
|
||||
|
||||
核函数的调用是 **异步的**,即主机调用核函数后不会等待核函数执行完成、而是立刻执行之后的语句。
|
||||
同步操作较为耗时,一般尽量避免;同时,只要在核函数调用后还有对其他任何能返回错误值的 API
|
||||
函数进行同步调用,都会触发主机和设备的同步并捕捉到核函数中可能发生的错误。
|
||||
|
||||
此外,主机和设备之间的数据拷贝会隐式地同步主机和设备。一般要获得精确的出错位置,还是需要显式地
|
||||
同步,例如调用 `cudaDeviceSynchronize()`。
|
||||
|
||||
或者,通过设置环境变量 `CUDA_LAUNCH_BLOCKING` 为 1,这样所有核函数的调用都将不再是异步的,
|
||||
而是同步的。就是说,主机调用一个核函数之后必须等待其执行完,才能向下执行。
|
||||
一般仅用于程序调试。
|
||||
|
||||
------
|
||||
|
||||
## CUDA-MEMCHECK 检查内存错误
|
||||
|
||||
CUDA 提供了 CUDA-MEMCHECK 的工具集,包括 memcheck, racecheck, initcheck, synccheck.
|
||||
|
||||
>> cuda-memcheck --tool memcheck [options] app-name [options]
|
||||
|
||||
对于 memcheck 工具,可以简化为:
|
||||
|
||||
>> cuda-memcheck [options] app-name [options]
|
||||
|
||||
对于本例,可以通过如下方式检测错误:
|
||||
|
||||
>> cuda-memcheck ./bin/check.exe
|
||||
========= CUDA-MEMCHECK
|
||||
CUDA ERROR:
|
||||
FILE: check.cu
|
||||
LINE: 56
|
||||
ERROR CODE: 9
|
||||
ERROR TEXT: invalid configuration argument
|
||||
========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaLaunchKernel.
|
||||
========= Saved host backtrace up to driver entry point at error
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x97c18) [0x2b8ca8]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x9a2da) [0x2bb36a]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll [0x7b52e]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x11ceaa) [0x33df3a]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x137532) [0x3585c2]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0x1679]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xd32b]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xd1a8]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xc6a1]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xcbf8]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xd944]
|
||||
========= Host Frame:C:\Windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17034]
|
||||
========= Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
|
||||
=========
|
||||
========= Program hit cudaErrorInvalidConfiguration (error 9) due to "invalid configuration argument" on CUDA API call to cudaGetLastError.
|
||||
========= Saved host backtrace up to driver entry point at error
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x97c18) [0x2b8ca8]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x9a2da) [0x2bb36a]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll [0x7b52e]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x11ceaa) [0x33df3a]
|
||||
========= Host Frame:C:\Windows\system32\DriverStore\FileRepository\nvhq.inf_amd64_5550755be1247d27\nvcuda64.dll (cuProfilerStop + 0x137532) [0x3585c2]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0x1461]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xcbfd]
|
||||
========= Host Frame:D:\3_codes\CudaSteps\capter4\bin\check.exe [0xd944]
|
||||
========= Host Frame:C:\Windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17034]
|
||||
========= Host Frame:C:\Windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x52651]
|
||||
=========
|
||||
========= ERROR SUMMARY: 2 errors
|
||||
|
||||
关于 CUDA-MEMCHECK 的更多内容,详见: [CUDA-MEMCHECK](https://docs.nvidia.com/cuda/cuda-memcheck/index.html)。
|
||||
|
||||
------
|
|
@ -0,0 +1,122 @@
|
|||
#include "error.cuh"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
const double EPSILON = 1.0e-10;
|
||||
const double a = 1.23;
|
||||
const double b = 2.34;
|
||||
const double c = 3.57;
|
||||
|
||||
|
||||
// 核函数。
|
||||
__global__ void add(const double *x, const double *y, double *z, const int N);
|
||||
|
||||
// 重载设备函数。
|
||||
__device__ double add_in_device(const double x, const double y);
|
||||
__device__ void add_in_device(const double x, const double y, double &z);
|
||||
|
||||
// 主机函数。
|
||||
void check(const double *z, const int N);
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
const int N = 1e4;
|
||||
const int M = sizeof(double) * N;
|
||||
|
||||
// 申请主机内存。
|
||||
double *h_x = new double[N];
|
||||
double *h_y = (double*) malloc(M);
|
||||
double *h_z = (double*) malloc(M);
|
||||
|
||||
// 初始化主机数据。
|
||||
for (int i = 0; i < N; ++i)
|
||||
{
|
||||
h_x[i] = a;
|
||||
h_y[i] = b;
|
||||
}
|
||||
|
||||
// 申请设备内存。
|
||||
double *d_x, *d_y, *d_z;
|
||||
CHECK(cudaMalloc((void**)&d_x, M));
|
||||
CHECK(cudaMalloc((void**)&d_y, M));
|
||||
CHECK(cudaMalloc((void**)&d_z, M));
|
||||
|
||||
// 从主机复制数据到设备。
|
||||
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
|
||||
CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));
|
||||
|
||||
// 在设备中执行计算。
|
||||
const int block_size = 128;
|
||||
const int grid_size = N/128 + 1;
|
||||
add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);
|
||||
CHECK(cudaGetLastError()); // 捕捉同步前的最后一个错误。
|
||||
CHECK(cudaDeviceSynchronize()); // 同步以捕获核函数错误。
|
||||
|
||||
// 从设备复制数据到主机。
|
||||
CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
|
||||
check(h_z, N);
|
||||
|
||||
// 释放主机内存。
|
||||
if (h_x) delete[] h_x;
|
||||
free(h_y);
|
||||
free(h_z);
|
||||
|
||||
// 释放设备内存。
|
||||
CHECK(cudaFree(d_x));
|
||||
CHECK(cudaFree(d_y));
|
||||
CHECK(cudaFree(d_z));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
__global__ void add(const double *x, const double *y, double *z, const int N)
|
||||
{
|
||||
// 在主机函数中需要依次对每个元素进行操作,需要使用一个循环。
|
||||
// 在设备函数中,因为采用“单指令-多线程”方式,所以可以去掉循环、只要将数组元素索引和线程索引一一对应即可。
|
||||
|
||||
const int n = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (n > N) return;
|
||||
|
||||
if (n%5 == 0)
|
||||
{
|
||||
z[n] = add_in_device(x[n], y[n]);
|
||||
}
|
||||
else
|
||||
{
|
||||
add_in_device(x[n], y[n], z[n]);
|
||||
}
|
||||
}
|
||||
|
||||
__device__ double add_in_device(const double x, const double y)
|
||||
{
|
||||
return x + y;
|
||||
}
|
||||
|
||||
__device__ void add_in_device(const double x, const double y, double &z)
|
||||
{
|
||||
z = x + y;
|
||||
}
|
||||
|
||||
void check(const double *z, const int N)
|
||||
{
|
||||
bool has_error = false;
|
||||
for (int i = 0; i < N ;++i)
|
||||
{
|
||||
if (fabs(z[i] - c) > EPSILON)
|
||||
{
|
||||
has_error = true;
|
||||
}
|
||||
}
|
||||
|
||||
printf("cuda; %s\n", has_error ? "has error" : "no error");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
#pragma once
|
||||
#include <stdio.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <device_launch_parameters.h>
|
||||
|
||||
#define CHECK(call) \
|
||||
do { \
|
||||
const cudaError_t error_code = call; \
|
||||
if (error_code != cudaSuccess) \
|
||||
{ \
|
||||
printf("CUDA ERROR: \n"); \
|
||||
printf(" FILE: %s\n", __FILE__); \
|
||||
printf(" LINE: %d\n", __LINE__); \
|
||||
printf(" ERROR CODE: %d\n", error_code); \
|
||||
printf(" ERROR TEXT: %s\n", cudaGetErrorString(error_code)); \
|
||||
exit(1); \
|
||||
} \
|
||||
}while(0); \
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue