working on capter8

2021-11-21 23:09:44 +08:00 · 2021-11-21 23:09:44 +08:00 · 7e38f8ed57
parent e0b202d06d
commit 7e38f8ed57
3 changed files with 129 additions and 5 deletions
--- a/capter8/ReadMe.md
+++ b/capter8/ReadMe.md
@ -1,10 +1,19 @@
 # 共享内存的合理使用

-
-
-
-
-
+共享内存是一种可以被程序员直接操作的缓存，主要作用有两个：  
+
+1. 减少核函数中对全局内存的访问次数，实现高效的线程块内部的通信；
+2. 提高全局内存访问的合并度。
+
+------
+
+## 数组归约
+
+对于多线程程序，默认情况下不同线程的执行顺序是不固定的（线程间独立）。
+
+采用 **折半归约法**：每个线程块先对各自负责的数据分片进行归约，最后再将各线程块的结果一并求和。
+
+------



--- a/capter8/reduce.cu
+++ b/capter8/reduce.cu
@ -0,0 +1,99 @@
+#include "../common/error.cuh"
+#include "../common/floats.hpp"
+#include <chrono>
+
+using namespace std::chrono;
+
+
+// Sequential reference reduction executed on the host.
+//
+// x: array holding at least N values.
+// N: number of leading elements of x to add up.
+//
+// Returns the sum of x[0] .. x[N-1], accumulated front to back in `real`
+// precision; the result is 0 when N is not positive.
+real reduce_cpu(const real *x, const int N)
+{
+    real total = 0.0;
+    int idx = 0;
+
+    while (idx < N)
+    {
+        total += x[idx];
+        ++idx;
+    }
+
+    return total;
+}
+
+// Per-block halving reduction that works directly on the buffer behind x.
+//
+// Each block owns the blockDim.x consecutive elements of x that start at
+// blockIdx.x * blockDim.x, folds them into the first element of that slice
+// and writes the block's sum to y[blockIdx.x].
+//
+// Notes:
+//   - x is modified in place (the slice holds partial sums afterwards).
+//   - No bounds checks are performed, so x must provide a full slice for
+//     every launched block and y one slot per block.
+//   - The stride is halved with integer division, so blockDim.x has to be a
+//     power of two for every element of the slice to be included.
+__global__ void reduce(real *x, real *y)
+{
+    const int lane = threadIdx.x;
+    real *block_data = &x[blockIdx.x * blockDim.x];  // first element owned by this block
+
+    int stride = blockDim.x / 2;
+    while (stride > 0)
+    {
+        // The lower half of the still-active range absorbs the upper half.
+        if (lane < stride)
+        {
+            block_data[lane] += block_data[lane + stride];
+        }
+
+        // Block-wide barrier: every thread of the block finishes this step
+        // before any thread begins the next one. Different blocks remain
+        // independent of each other.
+        __syncthreads();
+
+        stride /= 2;
+    }
+
+    // Only the first thread of the block publishes the block's result.
+    if (lane == 0)
+    {
+        y[blockIdx.x] = block_data[0];
+    }
+}
+
+
+// Host driver: fills an array with N copies of 1.23, sums it once on the CPU
+// and once on the GPU (per-block partial sums from the reduce kernel, finished
+// on the host), and prints each result together with its elapsed time.
+int main()
+{
+    // In single precision the large running sum swallows the small addends and
+    // the result is completely wrong; double precision does not have this problem.
+    const int N = 1e8;
+    const int block_size = 128;
+
+    // The reduce kernel performs no bounds checks and halves blockDim.x on every
+    // step, so it needs full blocks and a power-of-two thread count.
+    if (N % block_size != 0 || (block_size & (block_size - 1)) != 0)
+    {
+        cout << "N must be a multiple of block_size, and block_size a power of two" << endl;
+        return 1;
+    }
+
+    const int grid_size = (N + block_size - 1) / block_size;
+
+    // Byte counts are kept in size_t so that larger N cannot overflow an int.
+    const size_t x_bytes = static_cast<size_t>(N) * sizeof(real);
+    const size_t y_bytes = static_cast<size_t>(grid_size) * sizeof(real);
+
+    real *h_x = new real[N];
+    real *h_y = new real[grid_size];
+    for (int i = 0; i < N; ++i)
+    {
+        h_x[i] = 1.23;
+    }
+
+    cout << FLOAT_PREC << endl;
+
+    // CPU reference run, timed with the host clock.
+    auto t1 = system_clock::now();
+
+    cout << "cpu reduce:  " << reduce_cpu(h_x, N) << endl;
+
+    auto t2 = system_clock::now();
+    double time = duration<double, std::milli>(t2 - t1).count();
+    cout << "cpu reduce time cost: " << time << " ms" << endl;
+
+    real *d_x, *d_y;
+    CHECK(cudaMalloc(&d_x, x_bytes));
+    CHECK(cudaMalloc(&d_y, y_bytes));  // one partial sum per thread block
+    CHECK(cudaMemcpy(d_x, h_x, x_bytes, cudaMemcpyHostToDevice));
+
+    cudaEvent_t start, stop;
+    CHECK(cudaEventCreate(&start));
+    CHECK(cudaEventCreate(&stop));
+    CHECK(cudaEventRecord(start));
+    // Deliberately not wrapped in CHECK: cudaEventQuery may report
+    // cudaErrorNotReady while the event is still pending, which is not a failure.
+    cudaEventQuery(start);
+
+    reduce<<<grid_size, block_size>>>(d_x, d_y);
+    CHECK(cudaGetLastError());  // report launch errors right at the launch site
+    // Blocking copy: waits for the kernel and is part of the timed region.
+    CHECK(cudaMemcpy(h_y, d_y, y_bytes, cudaMemcpyDeviceToHost));
+
+    float elapsed_ms = 0;
+    CHECK(cudaEventRecord(stop));
+    CHECK(cudaEventSynchronize(stop));
+    CHECK(cudaEventElapsedTime(&elapsed_ms, start, stop));
+    printf("gpu reduce time cost: %f ms\n", elapsed_ms);
+
+    // Finish the reduction on the host by adding up the per-block partial sums.
+    cout << "gpu reduce:  " << reduce_cpu(h_y, grid_size) << endl;
+
+    // Release the timing events as well as the host and device buffers.
+    CHECK(cudaEventDestroy(start));
+    CHECK(cudaEventDestroy(stop));
+
+    delete[] h_x;
+    delete[] h_y;
+    CHECK(cudaFree(d_x));
+    CHECK(cudaFree(d_y));
+
+    return 0;
+}
+
--- a/common/floats.hpp
+++ b/common/floats.hpp
@ -0,0 +1,16 @@
+#pragma once
+#include <iostream>
+
+// Stream helpers made available unqualified to every file including this header.
+using std::cout;
+using std::endl;
+
+
+// Floating-point configuration: compiling with USE_DP defined selects double
+// precision, otherwise single precision is used.
+//   real       - the floating-point type selected for the build
+//   EPSILON    - small constant of type real (presumably a comparison
+//                tolerance; confirm against its users)
+//   FLOAT_PREC - banner text naming the active precision
+//
+// FLOAT_PREC is declared `const char *const`: with a const pointer the
+// definition has internal linkage, so including this header from several
+// translation units does not produce multiple-definition link errors.
+#ifdef USE_DP
+    typedef double real;
+    const real EPSILON = 1.0e-15;
+    const char *const FLOAT_PREC = "------ double precision";
+#else
+    typedef float real;
+    const real EPSILON = 1.0e-6f;
+    const char *const FLOAT_PREC = "------ float precision";
+#endif