CudaSteps/capter11/stream.cu

323 lines
8.2 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "../common/error.cuh"
#include "../common/floats.hpp"
#include <math.h>
#include <stdio.h>
const int NUM_REPEATS = 10;
const int N1 = 1024;
const int MAX_NUM_STREAMS = 30;
const int N2 = N1 * MAX_NUM_STREAMS;
const int M2 = sizeof(real) * N2;
cudaStream_t streams[MAX_NUM_STREAMS]; // cuda流数组全局变量由系统负责销毁。
const int N = 100000000;
const int M = sizeof(real) * N;
const int block_size = 128;
const int grid_size = (N - 1) / block_size + 1;
void timing(const real *h_x, const real *h_y, real *h_z,
const real *d_x, const real *d_y, real *d_z,
const int ratio, bool overlap);
void timing(const real *d_x, const real *d_y, real *d_z,
const int num);
void timing(const real *h_x, const real *h_y, real *h_z,
real *d_x, real *d_y, real *d_z,
const int num
);
int main(void)
{
real *h_x = (real*) malloc(M);
real *h_y = (real*) malloc(M);
real *h_z = (real*) malloc(M);
for (int n = 0; n < N; ++n)
{
h_x[n] = 1.23;
h_y[n] = 2.34;
}
real *d_x, *d_y, *d_z;
CHECK(cudaMalloc(&d_x, M));
CHECK(cudaMalloc(&d_y, M));
CHECK(cudaMalloc(&d_z, M));
CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));
// host and kernal overlap.
printf("Without CPU-GPU overlap (ratio = 10)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, false);
printf("With CPU-GPU overlap (ratio = 10)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 10, true);
printf("Without CPU-GPU overlap (ratio = 1)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, false);
printf("With CPU-GPU overlap (ratio = 1)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 1, true);
printf("Without CPU-GPU overlap (ratio = 1000)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, false);
printf("With CPU-GPU overlap (ratio = 1000)\n");
timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000, true);
// kernal and kernal overlap.
for (int n = 0 ; n < MAX_NUM_STREAMS; ++n)
{
// 创建cuda流。
CHECK(cudaStreamCreate(&(streams[n])));
}
for (int num = 1; num <= MAX_NUM_STREAMS; ++num)
{
timing(d_x, d_y, d_z, num);
}
for (int n = 0 ; n < MAX_NUM_STREAMS; ++n)
{
// 销毁cuda流。
CHECK(cudaStreamDestroy(streams[n]));
}
// kernal and data transfering overlap.
real *h_x2, *h_y2, *h_z2;
CHECK(cudaMallocHost(&h_x2, M));
CHECK(cudaMallocHost(&h_y2, M));
CHECK(cudaMallocHost(&h_z2, M));
for (int n = 0; n < N; ++n)
{
h_x2[n] = 1.23;
h_y2[n] = 2.34;
}
for (int i = 0; i < MAX_NUM_STREAMS; i++)
{
CHECK(cudaStreamCreate(&(streams[i])));
}
for (int num = 1; num <= MAX_NUM_STREAMS; num *= 2)
{
timing(h_x2, h_y2, h_z2, d_x, d_y, d_z, num);
}
for (int i = 0 ; i < MAX_NUM_STREAMS; i++)
{
CHECK(cudaStreamDestroy(streams[i]));
}
CHECK(cudaFreeHost(h_x2));
CHECK(cudaFreeHost(h_y2));
CHECK(cudaFreeHost(h_z2));
free(h_x);
free(h_y);
free(h_z);
CHECK(cudaFree(d_x));
CHECK(cudaFree(d_y));
CHECK(cudaFree(d_z));
return 0;
}
void cpu_sum(const real *x, const real *y, real *z, const int N_host)
{
for (int n = 0; n < N_host; ++n)
{
z[n] = x[n] + y[n];
}
}
void __global__ gpu_sum(const real *x, const real *y, real *z)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
if (n < N)
{
z[n] = x[n] + y[n];
}
}
void timing
(
const real *h_x, const real *h_y, real *h_z,
const real *d_x, const real *d_y, real *d_z,
const int ratio, bool overlap
)
{
float t_sum = 0;
float t2_sum = 0;
for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
if (!overlap)
{
cpu_sum(h_x, h_y, h_z, N / ratio);
}
gpu_sum<<<grid_size, block_size>>>(d_x, d_y, d_z);
if (overlap)
{
// 主机函数与设备核函数重叠。
cpu_sum(h_x, h_y, h_z, N / ratio);
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
printf("Time = %g ms.\n", elapsed_time);
if (repeat > 0)
{
t_sum += elapsed_time;
t2_sum += elapsed_time * elapsed_time;
}
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
const float t_ave = t_sum / NUM_REPEATS;
const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
printf("Time = %g +- %g ms.\n", t_ave, t_err);
}
void __global__ add(const real *d_x, const real *d_y, real *d_z)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
if (n < N1)
{
for (int i = 0; i < 100000; ++i)
{
d_z[n] = d_x[n] + d_y[n];
}
}
}
void timing(const real *d_x, const real *d_y, real *d_z, const int num)
{
float t_sum = 0;
float t2_sum = 0;
for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
for (int n = 0; n < num; ++n)
{
int offset = n * N1;
// 指定各个核函数的cuda流实现核函数的并行。
add<<<grid_size, block_size, 0, streams[n]>>>(d_x + offset, d_y + offset, d_z + offset);
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
if (repeat > 0)
{
t_sum += elapsed_time;
t2_sum += elapsed_time * elapsed_time;
}
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
const float t_ave = t_sum / NUM_REPEATS;
const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
printf("%g\n", t_ave);
}
void __global__ add2(const real *x, const real *y, real *z, int N)
{
const int n = blockDim.x * blockIdx.x + threadIdx.x;
if (n < N)
{
for (int i = 0; i < 40; ++i)
{
z[n] = x[n] + y[n];
}
}
}
void timing
(
const real *h_x, const real *h_y, real *h_z,
real *d_x, real *d_y, real *d_z,
const int num
)
{
int N1 = N / num;
int M1 = M / num;
float t_sum = 0;
float t2_sum = 0;
for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
{
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start);
for (int i = 0; i < num; i++)
{
int offset = i * N1;
// 划分主机不可分页内存,实现异步的数据传输。
// 每个cuda流都有各自的数据传输操作。
CHECK(cudaMemcpyAsync(d_x + offset, h_x + offset, M1,
cudaMemcpyHostToDevice, streams[i]));
CHECK(cudaMemcpyAsync(d_y + offset, h_y + offset, M1,
cudaMemcpyHostToDevice, streams[i]));
int block_size = 128;
int grid_size = (N1 - 1) / block_size + 1;
// 指定核函数的cuda流。
add2<<<grid_size, block_size, 0, streams[i]>>>(d_x + offset, d_y + offset, d_z + offset, N1);
CHECK(cudaMemcpyAsync(h_z + offset, d_z + offset, M1,
cudaMemcpyDeviceToHost, streams[i]));
}
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
float elapsed_time;
CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
if (repeat > 0)
{
t_sum += elapsed_time;
t2_sum += elapsed_time * elapsed_time;
}
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
}
const float t_ave = t_sum / NUM_REPEATS;
const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
printf("%d %g\n", num, t_ave);
}