working on capter9

This commit is contained in:
unknown 2021-11-25 22:52:47 +08:00
parent ecf8faf14a
commit d71e31cf80
2 changed files with 153 additions and 0 deletions

19
capter9/ReadMe.md Normal file
View File

@ -0,0 +1,19 @@
# 原子函数的合理使用
CUDA 中，一个线程的原子操作可以在不受其他线程的任何操作的影响下，完成对某个（全局内存或共享内存中的）数据的一套“读-改-写”操作。

------
## 完全在 GPU 中进行归约
有两种方法能够在 GPU 中得到最终结果：
1. 用另一个核函数将较短的数组进一步归约;
2. 在核函数末尾利用原子函数进行归约。
------

134
capter9/reduce.cu Normal file
View File

@ -0,0 +1,134 @@
#include "../common/error.cuh"
#include "../common/floats.hpp"
#include "../common/clock.cuh"
/**
 * Per-block sum reduction using dynamic shared memory.
 *
 * Every thread stages one element of x into shared memory (threads whose
 * global index is >= N stage zero), the block folds the staged values
 * pairwise, and thread 0 stores the block's partial sum in y[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block; blockDim.x must be a power of two, because the
 *     halving loop would otherwise skip some staged elements.
 *   - dynamic shared memory of at least blockDim.x * sizeof(real) bytes.
 *   - y must have room for gridDim.x elements.
 *
 * @param x device-accessible input array holding N elements
 * @param y device-accessible output array of per-block partial sums
 * @param N number of valid elements in x
 */
__global__ void reduce(real *x, real *y, const int N)
{
    const int tid = threadIdx.x;
    const int ind = tid + blockIdx.x * blockDim.x;
    extern __shared__ real curr_x[];

    // Typed zero avoids promoting the whole expression to double when real is float.
    curr_x[tid] = (ind < N) ? x[ind] : static_cast<real>(0);

    // All staged values must be visible before any thread reads a neighbour's
    // slot in the first folding step; without this barrier the read is a race.
    __syncthreads();

    for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
    {
        if (tid < offset)
        {
            curr_x[tid] += curr_x[tid + offset];
        }
        // Kept outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    if (tid == 0)
    {
        y[blockIdx.x] = curr_x[0];
    }
}
/**
 * Full sum reduction: per-block shared-memory fold followed by one atomicAdd
 * per block into a single accumulator.
 *
 * Every thread stages one element of x into shared memory (threads whose
 * global index is >= N stage zero), the block folds the staged values
 * pairwise, and thread 0 atomically adds the block's partial sum to *y.
 *
 * Launch requirements:
 *   - 1D grid and 1D block; blockDim.x must be a power of two, because the
 *     halving loop would otherwise skip some staged elements.
 *   - dynamic shared memory of at least blockDim.x * sizeof(real) bytes.
 *   - *y must be initialised by the caller (e.g. to zero) before launch;
 *     this kernel only accumulates into it.
 *   - if real is double, atomicAdd on double requires compute capability 6.0+.
 *
 * @param x device-accessible input array holding N elements
 * @param y device-accessible pointer to the single accumulator
 * @param N number of valid elements in x
 */
__global__ void reduce2(real *x, real *y, const int N)
{
    const int tid = threadIdx.x;
    const int ind = tid + blockIdx.x * blockDim.x;
    extern __shared__ real curr_x[];

    // Typed zero avoids promoting the whole expression to double when real is float.
    curr_x[tid] = (ind < N) ? x[ind] : static_cast<real>(0);

    // All staged values must be visible before any thread reads a neighbour's
    // slot in the first folding step; without this barrier the read is a race.
    __syncthreads();

    for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
    {
        if (tid < offset)
        {
            curr_x[tid] += curr_x[tid + offset];
        }
        // Kept outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    if (tid == 0)
    {
        // Atomic function atomicAdd(address, val): one read-modify-write per
        // block, so concurrent blocks cannot lose each other's updates.
        atomicAdd(y, curr_x[0]);
    }
}
/**
 * Driver: sums N copies of 1.23 on the GPU three times and prints each result.
 *
 *   1. reduce  with (bSize + 1) * sizeof(real) bytes of dynamic shared memory,
 *      partial sums finished on the host.
 *   2. reduce  with bSize * sizeof(real) bytes of dynamic shared memory,
 *      partial sums finished on the host.
 *   3. reduce2, which finishes the sum on the device with atomicAdd.
 *
 * The cudaClockStart / cudaClockCurr macros come from the project's clock
 * header and are kept between the same phases as before.
 */
int main()
{
    int N = 1e8;
    // Byte counts are kept in size_t so they are never narrowed to int.
    const size_t M = N * sizeof(real);
    int bSize = 32;
    int gSize = (N + bSize - 1) / bSize;
    const size_t partialBytes = gSize * sizeof(real);
    cout << FLOAT_PREC << endl;

    real *h_x, *h_y;
    h_x = new real[N];
    h_y = new real[gSize];
    for (int i = 0; i < N; ++i)
    {
        h_x[i] = 1.23;
    }

    cudaClockStart
    real *d_x, *d_y;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMalloc(&d_y, partialBytes));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyDefault));

    cudaClockCurr
    reduce<<<gSize, bSize, (bSize+1)*sizeof(real)>>>(d_x, d_y, N);
    // A kernel launch returns no status; query it explicitly so a bad launch
    // configuration is reported here instead of at a later, unrelated call.
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(h_y, d_y, partialBytes, cudaMemcpyDefault));
    real res = 0;
    for (int i = 0; i < gSize; ++i)
    {
        res += h_y[i];
    }
    cout << "reduce result: " << res << endl;

    cudaClockCurr
    reduce<<<gSize, bSize, (bSize)*sizeof(real)>>>(d_x, d_y, N);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(h_y, d_y, partialBytes, cudaMemcpyDefault));
    res = 0.0;
    for (int i = 0; i < gSize; ++i)
    {
        res += h_y[i];
    }
    cout << "reduce result: " << res << endl;

    cudaClockCurr
    real *d_y2, *h_y2;
    h_y2 = new real(0.0);
    CHECK(cudaMalloc(&d_y2, sizeof(real)));
    // cudaMalloc does not clear memory and reduce2 only accumulates into *d_y2,
    // so seed the device accumulator with the host-side zero before launching.
    CHECK(cudaMemcpy(d_y2, h_y2, sizeof(real), cudaMemcpyDefault));
    // Reduction done entirely by a kernel using shared memory and an atomic
    // function; it is more efficient because less data is transferred between
    // host and device.
    reduce2<<<gSize, bSize, (bSize)*sizeof(real)>>>(d_x, d_y2, N);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(h_y2, d_y2, sizeof(real), cudaMemcpyDefault));
    cout << "reduce2 result: " << *h_y2 << endl;

    cudaClockCurr
    delete[] h_x;
    delete[] h_y;
    delete h_y2;
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_y2));
    return 0;
}