add capter8

2021-11-22 22:55:37 +08:00 · 2021-11-22 22:55:37 +08:00 · cecb5af32e
parent 2c7646637c
commit cecb5af32e
2 changed files with 106 additions and 11 deletions
--- a/capter8/ReadMe.md
+++ b/capter8/ReadMe.md
@ -40,12 +40,32 @@

 由于共享内存访问速度快于全局内存，所以可以通过线程块内的共享内存将全局内存的非合并访问转为合并访问。

+**注意转置后的数组索引变换**。
+
 ------

 ## 共享内存的 bank 冲突

-共享内存在物理上被分为32个同样宽度、能被同时访问的内存bank。  
+共享内存在物理上被分为32个同样宽度（开普勒架构为 8 字节，其他为 4 字节）、能被同时访问的列向内存bank。  
+
+======================================
+bank0     bank1    ...    bank31
+======================================
+layer1    layer1   ...    layer1
+layer2    layer2   ...    layer2
+...
+layer32   layer32   ...   layer32


+只要同一个线程束内的多个线程不同时访问同一个 bank 中不同层的数据，该线程束对共享内存的访问就只需要  
+一次内存事务。当同一个线程束内的多个线程试图访问同一个 bank 中不同层的数据时，就会发生冲突。  
+在同一线程束中的多个线程对同一个 bank 中的 n 层数据访问将导致 n 次内存事务，  
+称为发生了 **n 路 bank 冲突**。
+
+当线程束内的32个线程同时访问同一个 bank 的32个不同层，这将导致 32 路 bank 冲突。对于非开普勒架构，  
+每个共享内存的宽带为 4 字节；于是每一层的32个 bank 将对应 32 个 float 数组元素。
+
+使用共享内存来改善全局内存的访问方式不一定会提高核函数的性能；不要过早优化，在优化程序时要对不同的  
+优化方案进行测试和比较。

 ------
--- a/capter8/matrix.cu
+++ b/capter8/matrix.cu
@ -1,18 +1,27 @@
-
 #include "../common/error.cuh"
 #include "../common/floats.hpp"
+#include <iomanip>
+#include <string>
+#include <fstream>

 #define TILE_DIM  32

 __constant__ int c_TILE_DIM = 32;  // 设备内存中线程块中矩阵维度（线程块大小，最大1024）。

+void show(const real *matrix, const int N, std::string outfile, std::string title);
 __global__ void transpose1(const real *src, real *dst, const int N);
 __global__ void transpose2(const real *src, real *dst, const int N);
 __global__ void transpose3(const real *src, real *dst, const int N);
+__global__ void transpose4(const real *src, real *dst, const int N);
+
+

 int main()
 {
-    const int N = 128; 
+    // 由于显存 2 GB，float 为 4 字节，double 为 8 字节，所以在 transpose3, transpose4中：
+    // float 矩阵维度不能超过 726；
+    // double 矩阵维度不能超过 342；
+    const int N = 300;  
    const int M = N * N * sizeof(real);

    int SIZE = 0;
@ -31,9 +40,10 @@ int main()
    {
        for (int j = 0; j < N; ++j)
        {
-            h_matrix_org[j] = i;
+            h_matrix_org[i * N + j] = i*1.0e-2;
        }   
    }
+    // show(h_matrix_org, N, "result.txt", "origin matrix");
    
    real *d_matrix_org, *d_matrix_res;
    CHECK(cudaMalloc(&d_matrix_org, M));
@ -51,6 +61,7 @@ int main()
    // 矩阵转置（全局内存合并读取、非合并写入）。
    transpose1<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+    // show(h_matrix_res, N, "result.txt", "transpose1");

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
@ -61,6 +72,7 @@ int main()
    // 矩阵转置（全局内存非合并读取、合并写入）。
    transpose2<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+    // show(h_matrix_res, N, "matrix.txt", "transpose2");

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
@ -71,6 +83,7 @@ int main()
    // 矩阵转置（通过共享内存全局内存合并读写）。
    transpose3<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+    // show(h_matrix_res, N, "result.txt", "transpose3");

    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
@ -78,6 +91,17 @@ int main()
    printf("matrix transpose3 time cost: %f ms.\n", curr_time - elapsed_time);
    elapsed_time = curr_time;

+    // 矩阵转置（通过共享内存、bank处理，实现全局内存合并读写）。
+    transpose4<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);     
+    CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
+    // show(h_matrix_res, N, "result.txt", "transpose3");
+
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&curr_time, start, stop));    
+    printf("matrix transpose4 time cost: %f ms.\n", curr_time - elapsed_time);
+    elapsed_time = curr_time;    
+
    delete[] h_matrix_res;
    delete[] h_matrix_org;
    CHECK(cudaFree(d_matrix_org));
@ -87,6 +111,27 @@ int main()
 }


+void show(const real *x, const int N, std::string outfile, std::string title)
+{
+    std::fstream out(outfile, std::ios::app);
+    if (!out.is_open())
+    {
+        std::cerr << "invalid output file: " << outfile << endl;
+        return;
+    }
+
+    out << "\n\n----------------" << title << endl;
+
+    for (int i = 0; i < N; ++i)
+    {
+        out << endl;
+        for (int j = 0; j < N; ++j)
+        {
+            out << std::setw(6) << x[i * N + j];
+        }
+    }
+}
+
 __global__ void transpose1(const real *src, real *dst, const int N)
 {
    const int nx = threadIdx.x + blockIdx.x * c_TILE_DIM;
@ -113,6 +158,11 @@ __global__ void transpose2(const real *src, real *dst, const int N)

 __global__ void transpose3(const real *src, real *dst, const int N)
 {
+    // 正常的做法中，全局内存的读写必有一个是非合并访问。
+    // 现在通过将非合并访问转移到共享内存，利用共享内存的高性能（100倍全局内存），提高计算速度：  
+    // 1. 首先将全局内存拷贝到线程块的共享内存；
+    // 2. 然后从共享内存非合并访问，读取数据，合并写入全局内存。
+
    __shared__ real s_mat[TILE_DIM][TILE_DIM];  //二维静态共享内存，存储线程块内的一片矩阵。

    int bx = blockIdx.x * blockDim.x;  // 当前线程块首线程在网格中列索引。
@ -123,18 +173,43 @@ __global__ void transpose3(const real *src, real *dst, const int N)

    if (tx < N && ty < N)
    {
-        // 全局内存合并访问，共享内存非合并访问（矩阵转置）。
+        // 全局内存合并访问，共享内存合并访问。
        s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx]; // 全局内存中二维矩阵一维存储。
    }
    __syncthreads();
    
    // 全局内存合并访问。
-    int tx2 = bx + threadIdx.y; // 索引？？？
-    int ty2 = by + threadIdx.x;
-    if (tx2 < N && ty2 < N)
+    if (tx < N && ty < N)
    {
-        // 全局内存合并访问，共享内存合并访问。
-        dst[ty2 * N + tx2] = s_mat[threadIdx.x][threadIdx.y];  // 保存转置结果到全局内存。
+        // 局部矩阵转置和全局内存合并写入。
+        int x = by + threadIdx.x;
+        int y = bx + threadIdx.y;
+        dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y]; 
    }
 }

+__global__ void transpose4(const real *src, real *dst, const int N)
+{
+    // 通过修改数组行大小，错开数组元素在共享内存bank中的分布，
+    // 避免线程束的 32路bank冲突。
+    __shared__ real s_mat[TILE_DIM][TILE_DIM + 1];  
+
+    int bx = blockIdx.x * blockDim.x; 
+    int by = blockIdx.y * blockDim.y; 
+
+    int tx = threadIdx.x + bx;  
+    int ty = threadIdx.y + by;  
+
+    if (tx < N && ty < N)
+    {
+        s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx]; 
+    }
+    __syncthreads();
+    
+    if (tx < N && ty < N)
+    {
+        int x = by + threadIdx.x;
+        int y = bx + threadIdx.y;
+        dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y]; 
+    }
+}