From 8b239547ee677df317c37cb9d778d686895da16a Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 Dec 2021 22:10:03 +0800 Subject: [PATCH] feat(capter11): add capter11 --- capter11/ReadMe.md | 124 +++++++++++++++++ capter11/stream.cu | 322 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 446 insertions(+) create mode 100644 capter11/ReadMe.md create mode 100644 capter11/stream.cu diff --git a/capter11/ReadMe.md b/capter11/ReadMe.md new file mode 100644 index 0000000..d7012c9 --- /dev/null +++ b/capter11/ReadMe.md @@ -0,0 +1,124 @@ +# CUDA 流 + +一个 CUDA 流一般是指由主机发出的、在设备中执行的cuda操作序列(即和cuda有关的操作, +如主机--设备数据传输和核函数执行)。目前不考虑由设备段发出的流。 + +任何cuda操作都存在于某个cuda流,要么是 **默认流(default stream)**,也称为 **空流**; +要么是明确指定的流。非默认的cuda流(非空流)都是在主机端产生与销毁。 +一个cuda流由类型为 `cudaStream_t` 的变量表示,创建与销毁的方式: + +```cuda +cudaSteam_t stream; +CHECK(cudaStreamCreate(&stream)); +... +CHECK(cudaStreamDestroy(stream)); +``` + +主机中可以产生多个相互独立的cuda流,并实现cuda流之间的并行。 + +为了检查一个cuda流中所有操作是否都已在设备中执行完毕: + +```cuda +cudaError_t cudaStreamSynchronize(cudaStream_t stream); +cudaError_t cudaStreamQuery(cudaStream_t stream); +``` + +`cudaStreamSynchronize` 会强制阻塞主机,直到其中的stream流执行完毕; +`cudaStreamQuery` 不会阻塞主机,只是检查cuda流(stream)是否执行完毕,若是,则返回 `cudaSuccess`; +否则,返回 `cudaErrorNotReady`。 + +------ + +## 在默认流中重叠主机和设备计算 + +**同一个cuda流在设备中都是顺序执行的。** 在数组相加的例子中: + +```cuda +cudaMemcpy(d_x, h_x, M, cudaMemcpyDefault); +cudaMemcpy(d_y, h_y, M, cudaMemcpyDefault); +add<<>>(d_x, d_y, d_z, N); +cudaMemcpy(h_z, d_z, M, cudaMemcpyDefault); +``` + +从设备的角度,以上4个cuda语句是按代码顺序执行的。 + +采用 `cudaMemcpy` 函数在主机与设备间拷贝数据,是具有隐式同步功能的。 +所以从主机的角度看,数据传输是同步的或者说阻塞的,即主机在发出命令: + +```cuda +cudaMemcpy(d_x, h_x, M, cudaMemcpyDefault); +``` + +之后,会等待该命令执行完完毕,再接着往下走;数据传输时,主机是闲置的。 +与此不同的是,核函数的启动是异步的或者说非阻塞的,即在主机发出命令: + +```cuda +add<<>>(d_x, d_y, d_z, N); +``` + +之后,不会等待该命令执行完毕,而是立刻得到程序的控制权。紧接着发出: + +```cuda +cudaMemcpy(h_z, d_z, M, cudaMemcpyDefault); +``` + +然而,该命令不会被立刻执行,因为其与核函数同处默认流,需要顺序执行。 + +所以,主机在发出核函数调用后会立刻发出下一个命令;如果下一个命令是 +主机中的某个计算任务,那么主机就会在设备执行核函数的同时执行计算。 +这样就可以实现主机和设备的重叠计算。 + +当主机和设备的计算量相当时,将主机函数放在设备核函数后可以达到主机函数 +与设备函数并发执行的效果,从而有效地隐藏主机函数的执行时间。 + +------ + +## 非默认 cuda 流重叠多个核函数 + +要实现多个核函数之间的并行必须使用多个非默认 cuda 流。 + +使用多个流相对于使用一个流有加速效果;当流的数目超过某个阈值时,加速比就趋于饱和。 +制约加速比的因素: ++ GPU 计算资源,当核函数的线程总数超过某一值时,再增加流的数目就不会带来更高性能; ++ GPU 中能够并发执行的核函数的上限。 + +指定核函数的cuda流的方法: + +```cuda +kernal_func<<>>(params); +``` + +在调用核函数时,如果不需要使用共享内存,则该项设为0;同时指定cuda流的id。 + +计算能力为7,5的GPU能执行的核函数上限值为128。 + +------ + +## 非默认 cuda 流重叠核函数与数据传递 + +要实现核函数执行与数据传输的并发(重叠),必须让这两个操作处于不同的非默认流; +同时,数据传输需要使用 `cudaMemcpy` 的异步版本 `cudaMemcpyAsync`。 + +异步传输由GPU的DMA(direct memory access)实现,不需要主机的参与。 + +使用异步的数据传输函数时,需要将主机内存定义为不可分页内存或者固定内存,从而 +防止在程序执行期间物理地址被修改。如果将可分页内存传递给 `cudaMemcpyAsync` +则会导致同步传输。 + +主机不可分页内存的分配与释放: + +```cuda +cudaError_t cudaMallocHost(void **ptr, size_t size); +或者 +cudaError_t cudaHostAlloc(void **ptr, size_t size); + +cudaError_t cudaFreeHost(void *ptr); +``` + +要利用多个流提升性能,一种方法是将数据和相应计算操作分为若干等分, +然后在每个流中发布一个cuda操作序列。 + +如果核函数执行、主机与设备间的数据传输这3个cuda操作能完全并行执行, +理论上最大加速比为 3。 + +------ \ No newline at end of file diff --git a/capter11/stream.cu b/capter11/stream.cu new file mode 100644 index 0000000..e098ad1 --- /dev/null +++ b/capter11/stream.cu @@ -0,0 +1,322 @@ +#include "../common/error.cuh" +#include "../common/floats.hpp" +#include +#include + + +const int NUM_REPEATS = 10; +const int N1 = 1024; +const int MAX_NUM_STREAMS = 30; +const int N2 = N1 * MAX_NUM_STREAMS; +const int M2 = sizeof(real) * N2; +cudaStream_t streams[MAX_NUM_STREAMS]; // cuda流数组,全局变量由系统负责销毁。 + + +const int N = 100000000; +const int M = sizeof(real) * N; +const int block_size = 128; +const int grid_size = (N - 1) / block_size + 1; + + +void timing(const real *h_x, const real *h_y, real *h_z, + const real *d_x, const real *d_y, real *d_z, + const int ratio, bool overlap); +void timing(const real *d_x, const real *d_y, real *d_z, + const int num); +void timing(const real *h_x, const real *h_y, real *h_z, + real *d_x, real *d_y, real *d_z, + const int num +); + +int main(void) +{ + real *h_x = (real*) malloc(M); + real *h_y = (real*) malloc(M); + real *h_z = (real*) malloc(M); + for (int n = 0; n < N; ++n) + { + h_x[n] = 1.23; + h_y[n] = 2.34; + } + + real *d_x, *d_y, *d_z; + CHECK(cudaMalloc(&d_x, M)); + CHECK(cudaMalloc(&d_y, M)); + CHECK(cudaMalloc(&d_z, M)); + CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice)); + + + // host and kernal overlap. + printf("Without CPU-GPU overlap (ratio = 10)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, false); + printf("With CPU-GPU overlap (ratio = 10)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, true); + + printf("Without CPU-GPU overlap (ratio = 1)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, false); + printf("With CPU-GPU overlap (ratio = 1)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, true); + + printf("Without CPU-GPU overlap (ratio = 1000)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, false); + printf("With CPU-GPU overlap (ratio = 1000)\n"); + timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, true); + + + // kernal and kernal overlap. + for (int n = 0 ; n < MAX_NUM_STREAMS; ++n) + { + // 创建cuda流。 + CHECK(cudaStreamCreate(&(streams[n]))); + } + + for (int num = 1; num <= MAX_NUM_STREAMS; ++num) + { + timing(d_x, d_y, d_z, num); + } + + for (int n = 0 ; n < MAX_NUM_STREAMS; ++n) + { + // 销毁cuda流。 + CHECK(cudaStreamDestroy(streams[n])); + } + + + // kernal and data transfering overlap. + real *h_x2, *h_y2, *h_z2; + CHECK(cudaMallocHost(&h_x2, M)); + CHECK(cudaMallocHost(&h_y2, M)); + CHECK(cudaMallocHost(&h_z2, M)); + for (int n = 0; n < N; ++n) + { + h_x2[n] = 1.23; + h_y2[n] = 2.34; + } + + for (int i = 0; i < MAX_NUM_STREAMS; i++) + { + CHECK(cudaStreamCreate(&(streams[i]))); + } + + for (int num = 1; num <= MAX_NUM_STREAMS; num *= 2) + { + timing(h_x2, h_y2, h_z2, d_x, d_y, d_z, num); + } + + for (int i = 0 ; i < MAX_NUM_STREAMS; i++) + { + CHECK(cudaStreamDestroy(streams[i])); + } + + CHECK(cudaFreeHost(h_x2)); + CHECK(cudaFreeHost(h_y2)); + CHECK(cudaFreeHost(h_z2)); + + free(h_x); + free(h_y); + free(h_z); + CHECK(cudaFree(d_x)); + CHECK(cudaFree(d_y)); + CHECK(cudaFree(d_z)); + + return 0; +} + +void cpu_sum(const real *x, const real *y, real *z, const int N_host) +{ + for (int n = 0; n < N_host; ++n) + { + z[n] = x[n] + y[n]; + } +} + +void __global__ gpu_sum(const real *x, const real *y, real *z) +{ + const int n = blockDim.x * blockIdx.x + threadIdx.x; + if (n < N) + { + z[n] = x[n] + y[n]; + } +} + +void timing +( + const real *h_x, const real *h_y, real *h_z, + const real *d_x, const real *d_y, real *d_z, + const int ratio, bool overlap +) +{ + float t_sum = 0; + float t2_sum = 0; + + for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat) + { + cudaEvent_t start, stop; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&stop)); + CHECK(cudaEventRecord(start)); + cudaEventQuery(start); + + if (!overlap) + { + cpu_sum(h_x, h_y, h_z, N / ratio); + } + + gpu_sum<<>>(d_x, d_y, d_z); + + if (overlap) + { + // 主机函数与设备核函数重叠。 + cpu_sum(h_x, h_y, h_z, N / ratio); + } + + CHECK(cudaEventRecord(stop)); + CHECK(cudaEventSynchronize(stop)); + float elapsed_time; + CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); + printf("Time = %g ms.\n", elapsed_time); + + if (repeat > 0) + { + t_sum += elapsed_time; + t2_sum += elapsed_time * elapsed_time; + } + + CHECK(cudaEventDestroy(start)); + CHECK(cudaEventDestroy(stop)); + } + + const float t_ave = t_sum / NUM_REPEATS; + const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave); + printf("Time = %g +- %g ms.\n", t_ave, t_err); +} + +void __global__ add(const real *d_x, const real *d_y, real *d_z) +{ + const int n = blockDim.x * blockIdx.x + threadIdx.x; + if (n < N1) + { + for (int i = 0; i < 100000; ++i) + { + d_z[n] = d_x[n] + d_y[n]; + } + } +} + +void timing(const real *d_x, const real *d_y, real *d_z, const int num) +{ + float t_sum = 0; + float t2_sum = 0; + + for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat) + { + cudaEvent_t start, stop; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&stop)); + CHECK(cudaEventRecord(start)); + cudaEventQuery(start); + + for (int n = 0; n < num; ++n) + { + int offset = n * N1; + + // 指定各个核函数的cuda流,实现核函数的并行。 + add<<>>(d_x + offset, d_y + offset, d_z + offset); + } + + CHECK(cudaEventRecord(stop)); + CHECK(cudaEventSynchronize(stop)); + float elapsed_time; + CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); + + if (repeat > 0) + { + t_sum += elapsed_time; + t2_sum += elapsed_time * elapsed_time; + } + + CHECK(cudaEventDestroy(start)); + CHECK(cudaEventDestroy(stop)); + } + + const float t_ave = t_sum / NUM_REPEATS; + const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave); + printf("%g\n", t_ave); +} + +void __global__ add2(const real *x, const real *y, real *z, int N) +{ + const int n = blockDim.x * blockIdx.x + threadIdx.x; + if (n < N) + { + for (int i = 0; i < 40; ++i) + { + z[n] = x[n] + y[n]; + } + } +} + +void timing +( + const real *h_x, const real *h_y, real *h_z, + real *d_x, real *d_y, real *d_z, + const int num +) +{ + int N1 = N / num; + int M1 = M / num; + + float t_sum = 0; + float t2_sum = 0; + + for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat) + { + cudaEvent_t start, stop; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&stop)); + CHECK(cudaEventRecord(start)); + cudaEventQuery(start); + + for (int i = 0; i < num; i++) + { + int offset = i * N1; + + // 划分主机不可分页内存,实现异步的数据传输。 + // 每个cuda流都有各自的数据传输操作。 + CHECK(cudaMemcpyAsync(d_x + offset, h_x + offset, M1, + cudaMemcpyHostToDevice, streams[i])); + CHECK(cudaMemcpyAsync(d_y + offset, h_y + offset, M1, + cudaMemcpyHostToDevice, streams[i])); + + int block_size = 128; + int grid_size = (N1 - 1) / block_size + 1; + + // 指定核函数的cuda流。 + add2<<>>(d_x + offset, d_y + offset, d_z + offset, N1); + + CHECK(cudaMemcpyAsync(h_z + offset, d_z + offset, M1, + cudaMemcpyDeviceToHost, streams[i])); + } + + CHECK(cudaEventRecord(stop)); + CHECK(cudaEventSynchronize(stop)); + float elapsed_time; + CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); + + if (repeat > 0) + { + t_sum += elapsed_time; + t2_sum += elapsed_time * elapsed_time; + } + + CHECK(cudaEventDestroy(start)); + CHECK(cudaEventDestroy(stop)); + } + + const float t_ave = t_sum / NUM_REPEATS; + const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave); + printf("%d %g\n", num, t_ave); +} + +