working on capter8

2021-11-22 14:02:56 +08:00 · 2021-11-22 14:02:56 +08:00 · 2c7646637c
parent 7e38f8ed57
commit 2c7646637c
3 changed files with 278 additions and 6 deletions
--- a/capter8/ReadMe.md
+++ b/capter8/ReadMe.md
@ -13,7 +13,39 @@

 采用 **折半规约法**，通过线程块对数据分片归约，最后再一并求和。

+核函数中循环的每一轮都会被拆解、分配到线程块内的所有线程上执行，而不是一个线程连续执行一次完整循环。  
+核函数中代码是 “单指令多线程” ，代码真正的执行顺序与出现顺序可能不同。所以 线程 0、1、... 127之间实际上并行的。
+
+保证一个线程块中所有线程在执行该语句后面的语句之前，都完全执行了前面的语句：通过 `__syncthreads()`   
+实现一个线程块中所有线程按照代码出现的顺序执行指令，但是不同线程块之间依然是独立、异步的。
+
+**共享内存变量**，可以在核函数中通过限定符 `__shared__` 定义一个共享内存变量，  
+这样就相当于在每一个线程块中有一个该变量的副本。虽然每个副本都是独立的，但核函数中对共享变量的操作  
+都将 **同时** 作用在所有副本上。
+
+核函数中可以直接使用函数外部由 `#define` 或 `const` 定义的常量，但在 MSVC 中限制了核函数使用 `const` 定义的常量。
+
+利用共享内存进行线程块之间的合作（通信）之前，都要进行同步，以确保共享内存变量中数据对于所有线程块内的  
+所有线程都是准备好的。 
+
+共享内存的生命周期仅在核函数内，所以必须在核函数结束前将共享内存中需要的结果保存到全局内存。  
+通过共享内存可以避免修改全局内存变量，同时不再要求全局内存数组为 线程块大小的整数倍。  
+
+线程块的共享内存根据申请方式分为：静态共享内存变量和动态共享内存变量。  
+前者在核函数中定义共享内存大小（通过编译期常量），后者在主机调用核函数时指定大小（可以提高可维护性）。
+
 ------

+## 矩阵转置
+
+由于共享内存访问速度快于全局内存，所以可以通过线程块内的共享内存将全局内存的非合并访问转为合并访问。
+
+------
+
+## 共享内存的 bank 冲突
+
+共享内存在物理上被分为32个同样宽度、能被同时访问的内存bank。  


+
+------
--- a/capter8/matrix.cu
+++ b/capter8/matrix.cu
@ -0,0 +1,140 @@
+
+#include "../common/error.cuh"
+#include "../common/floats.hpp"
+
+#define TILE_DIM  32
+
+__constant__ int c_TILE_DIM = 32;  // 设备内存中线程块中矩阵维度（线程块大小，最大1024）。
+
+__global__ void transpose1(const real *src, real *dst, const int N);
+__global__ void transpose2(const real *src, real *dst, const int N);
+__global__ void transpose3(const real *src, real *dst, const int N);
+
+int main()
+{
+    const int N = 128; 
+    const int M = N * N * sizeof(real);
+
+    int SIZE = 0;
+    CHECK(cudaMemcpyFromSymbol(&SIZE, c_TILE_DIM, sizeof(int)));   
+
+    const int grid_size_x = (N + SIZE - 1)/SIZE; // 获取网格大小。
+    const int grid_size_y = grid_size_x;
+
+    const dim3 block_size(SIZE, SIZE);
+    const dim3 grid_size(grid_size_x, grid_size_y);
+
+    real *h_matrix_org, *h_matrix_res;
+    h_matrix_org = new real[N*N];
+    h_matrix_res = new real[N*N];
+    for (int i = 0; i < N; ++i)
+    {
+        for (int j = 0; j < N; ++j)
+        {
+            h_matrix_org[j] = i;
+        }   
+    }
+
+    real *d_matrix_org, *d_matrix_res;
+    CHECK(cudaMalloc(&d_matrix_org, M));
+    CHECK(cudaMalloc(&d_matrix_res, M));
+    CHECK(cudaMemcpy(d_matrix_org, h_matrix_org, M, cudaMemcpyDefault));    
+
+    float elapsed_time = 0;
+    float curr_time = 0;
+    cudaEvent_t start, stop;
+    CHECK(cudaEventCreate(&start));
+    CHECK(cudaEventCreate(&stop));
+    CHECK(cudaEventRecord(start));
+    cudaEventQuery(start);
+
+    // 矩阵转置（全局内存合并读取、非合并写入）。
+    transpose1<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
+    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));    
+    printf("matrix transpose1 time cost: %f ms.\n", curr_time - elapsed_time);
+    elapsed_time = curr_time;
+
+    // 矩阵转置（全局内存非合并读取、合并写入）。
+    transpose2<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
+    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));    
+    printf("matrix transpose2 time cost: %f ms.\n", curr_time - elapsed_time);
+    elapsed_time = curr_time;
+
+    // 矩阵转置（通过共享内存全局内存合并读写）。
+    transpose3<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
+    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));    
+    printf("matrix transpose3 time cost: %f ms.\n", curr_time - elapsed_time);
+    elapsed_time = curr_time;
+
+    delete[] h_matrix_res;
+    delete[] h_matrix_org;
+    CHECK(cudaFree(d_matrix_org));
+    CHECK(cudaFree(d_matrix_res));
+
+    return 0;
+}
+
+
+__global__ void transpose1(const real *src, real *dst, const int N)
+{
+    const int nx = threadIdx.x + blockIdx.x * c_TILE_DIM;
+    const int ny = threadIdx.y + blockIdx.y * c_TILE_DIM;
+
+    if (nx < N && ny < N)
+    {
+        // 矩阵转置（合并读取、非合并写入）。
+        dst[nx*N + ny] = src[ny*N + nx];
+    }
+}
+
+__global__ void transpose2(const real *src, real *dst, const int N)
+{
+    const int nx = threadIdx.x + blockIdx.x * c_TILE_DIM;
+    const int ny = threadIdx.y + blockIdx.y * c_TILE_DIM;
+
+    if (nx < N && ny < N)
+    {
+        // 矩阵转置（非合并读取、合并写入）。
+        dst[ny*N + nx] = __ldg(&src[nx*N + ny]);   // 显示调用 `__ldg()` 函数缓存全局内存。 
+    }
+}
+
+__global__ void transpose3(const real *src, real *dst, const int N)
+{
+    __shared__ real s_mat[TILE_DIM][TILE_DIM];  //二维静态共享内存，存储线程块内的一片矩阵。
+
+    int bx = blockIdx.x * blockDim.x;  // 当前线程块首线程在网格中列索引。
+    int by = blockIdx.y * blockDim.y;  // 当前线程块首线程在网格中行索引。
+
+    int tx = threadIdx.x + bx;  // 当前线程在网格中列索引。
+    int ty = threadIdx.y + by;  // 当前线程在网格中行索引。
+
+    if (tx < N && ty < N)
+    {
+        // 全局内存合并访问，共享内存非合并访问（矩阵转置）。
+        s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx]; // 全局内存中二维矩阵一维存储。
+    }
+    __syncthreads();
+    
+    // 全局内存合并访问。
+    int tx2 = bx + threadIdx.y; // 索引？？？
+    int ty2 = by + threadIdx.x;
+    if (tx2 < N && ty2 < N)
+    {
+        // 全局内存合并访问，共享内存合并访问。
+        dst[ty2 * N + tx2] = s_mat[threadIdx.x][threadIdx.y];  // 保存转置结果到全局内存。
+    }
+}
+
--- a/capter8/reduce.cu
+++ b/capter8/reduce.cu
@ -4,6 +4,8 @@

 using namespace std::chrono;

+__constant__ int BLOCK_DIM = 128;
+

 real reduce_cpu(const real *x, const int N)
 {
@ -18,25 +20,93 @@ real reduce_cpu(const real *x, const int N)

 __global__ void reduce(real *x, real *y)
 {
+    // 这里执行迭代折半归约计算时，实际上的线程执行过程：
+    // 1. 线程 0-127，offset = N/2, 迭代第一次；
+    // 2. 线程 0-127，offset = N/4, 迭代第二次；
+    // ...
+    // 即，核函数中循环的每一轮都会被拆解、分配到线程块内的所有线程上执行，而不是一个  
+    // 线程连续执行一次完整循环。
    const int tid = threadIdx.x;
    real *curr_x = x + blockIdx.x * blockDim.x;  // 当前线程块中处理的内存首地址。

-    for (int offset = blockDim.x >> 1; offset > 0; offset >>=1)  // 折半归约。
+    for (int offset = blockDim.x >> 1; offset > 0; offset >>=1) // 迭代折半归约。
    {
-        if (tid < offset)
+        // 由于条件筛选，实际导致每轮有效的线程数量减半，即 “线程束的分化”。
+        // 要求数组大小为线程块大小的整数倍。
+        if (tid < offset) 
        {
+            // 核函数中代码是 “单指令多线程” ，代码真正的执行顺序与出现顺序可能不同。
+            // 所以 线程 0、1、... 127之间实际上并行的。
            curr_x[tid] += curr_x[tid + offset];
        }

        // 保证一个线程块中所有线程在执行该语句后面的语句之前，都完全执行了前面的语句。
-        // 实现一个线程块中线程按照代码出现的顺序执行指令。
+        // 实现一个线程块中所有线程按照代码出现的顺序执行指令，即线程1等待线程0，如此。
        // 但是不同线程块之间依然是独立、异步的。
        __syncthreads();
    }

    if (tid == 0)
    {
-        y[blockIdx.x] = curr_x[0];
+        // 通过线程块内同步，线程块 0 中的归约顺序：
+        // 第一轮，curr_x[0] += curr_x[0+64], ... curr_x[63] += curr_x[63+64]；
+        // 第二轮，curr_x[0] += curr_x[0+32], ... curr_x[31] += curr_x[31+32]；
+        // 第三轮，curr_x[0] += curr_x[0+16], ... curr_x[15] += curr_x[15+16]； 
+        // 第四轮，curr_x[0] += curr_x[0+ 8], ... curr_x[7 ] += curr_x[7 + 8]；  
+        // 第五轮，curr_x[0] += curr_x[0+ 4], ... curr_x[3 ] += curr_x[3 + 4]； 
+        // 第六轮，curr_x[0] += curr_x[0+ 2], curr_x[1 ] += curr_x[1 + 2]； 
+        // 第七轮，curr_x[0] += curr_x[0+ 1]；           
+        y[blockIdx.x] = curr_x[0];  
+    }
+}
+
+__global__ void reduce_shared(const real *x, real *y, const int N)
+{
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int ind = bid * blockDim.x + tid;
+
+    __shared__ real s_x[128];  // 定义线程块静态共享内存变量。 
+    s_x[tid] = (ind < N) ? x[ind] : 0.0;  // 拷贝全局内存变量到线程块内的共享内存数据副本。
+    __syncthreads();   // 同步线程块的数据拷贝操作，保证各线程块中数据对于块内线程都准备好。
+
+    for (int offset = blockDim.x>>1; offset > 0; offset>>=1)
+    {
+        if (ind < offset)
+        {
+            s_x[tid] += s_x[tid + offset];
+        }
+        __syncthreads(); // 线程块内线程同步。
+    }
+
+    if (tid == 0)
+    {
+        y[bid] = s_x[0]; // 保存各个线程块中共享内存的0元素到全局内存。
+    }
+}
+
+__global__ void reduce_shared2(const real *x, real *y, const int N)
+{
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    const int ind = bid * blockDim.x + tid;
+
+    extern __shared__ real s_x[];  // 定义线程块动态共享内存变量，内存大小由主机调用核函数时定义。 
+    s_x[tid] = (ind < N) ? x[ind] : 0.0;  // 拷贝全局内存变量到线程块内的共享内存数据副本。
+    __syncthreads();   // 同步线程块的数据拷贝操作，保证各线程块中数据对于块内线程都准备好。
+
+    for (int offset = blockDim.x>>1; offset > 0; offset>>=1)
+    {
+        if (ind < offset)
+        {
+            s_x[tid] += s_x[tid + offset];
+        }
+        __syncthreads(); // 线程块内线程同步。
+    }
+
+    if (tid == 0)
+    {
+        y[bid] = s_x[0]; // 保存各个线程块中共享内存的0元素到全局内存。
    }
 }

@ -46,7 +116,8 @@ int main()
    int N = 1e8;  // 单精度将发生 “大数吃小数” 的现象，导致结果完全错误；双精度没有问题。
    int M = N * sizeof(real);

-    int block_size = 128;
+    int block_size = 0;
+    CHECK(cudaMemcpyFromSymbol(&block_size, BLOCK_DIM, sizeof(real)));
    int grid_size = (N + block_size - 1)/block_size; 

    real *h_x = new real[N];
@ -60,6 +131,7 @@ int main()

    auto t1 = system_clock::now();

+    // cpu归约，单精度下计算错误，大数吃小数。
    cout << "cpu reduce:  " << reduce_cpu(h_x, N) << endl;

    auto t2 = system_clock::now();
@ -78,7 +150,8 @@ int main()
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);

-    reduce<<<grid_size, block_size>>>(d_x, d_y);
+    // gpu归约，单精度下也能控制误差，稳健性更强。
+    reduce<<<grid_size, block_size>>>(d_x, d_y); 
    CHECK(cudaMemcpy(h_y, d_y, size, cudaMemcpyDefault));
    CHECK(cudaGetLastError());

@ -86,9 +159,36 @@ int main()
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&curr_time, start, stop));
+    cout << "gpu reduce: " << reduce_cpu(h_y, grid_size) << endl;
    printf("gpu reduce time cost: %f ms\n", curr_time - elap_time);
    elap_time = curr_time;

+    // gpu归约，采用静态共享内存的加速。
+    reduce_shared<<<grid_size, block_size>>>(d_x, d_y, N); 
+    CHECK(cudaMemcpy(h_y, d_y, size, cudaMemcpyDefault));
+    CHECK(cudaGetLastError());
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));
+    cout << "gpu shared reduce: " << reduce_cpu(h_y, grid_size) << endl;
+    printf("gpu shared reduce time cost: %f ms\n", curr_time - elap_time);
+    elap_time = curr_time;    
+
+    // gpu归约，采用动态共享内存的加速。
+    // <<<grid_size, block_size, sharedMemSize>>>，第三个参数指定动态共享内存大小。
+    int sharedMemSize = block_size * sizeof(real);  // 核函数中每个线程块的动态共享内存大小。
+    reduce_shared2<<<grid_size, block_size, sharedMemSize>>>(d_x, d_y, N); 
+    CHECK(cudaMemcpy(h_y, d_y, size, cudaMemcpyDefault));
+    CHECK(cudaGetLastError());
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));
+    cout << "gpu shared2 reduce: " << reduce_cpu(h_y, grid_size) << endl;
+    printf("gpu shared2 reduce time cost: %f ms\n", curr_time - elap_time);
+    elap_time = curr_time; 
+
    delete[] h_x;
    delete[] h_y;
    CHECK(cudaFree(d_x));