// CudaSteps/capter8/matrix.cu
// Matrix-transpose benchmark: compares a naive global-memory transpose
// (coalesced read / uncoalesced write and vice versa) against shared-memory
// tiled versions, with and without bank-conflict padding.

#include "../common/error.cuh"
#include "../common/floats.hpp"
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
// Compile-time tile edge; sizes the static shared-memory tiles in transpose3/4.
#define TILE_DIM 32
__constant__ int c_TILE_DIM = 32; // tile/block edge held in constant memory; a 32x32 block is the 1024-thread per-block maximum
// Append an N x N matrix dump to `outfile` under `title` (defined below).
void show(const real *matrix, const int N, std::string outfile, std::string title);
// Transpose kernels: coalesced read / uncoalesced write.
__global__ void transpose1(const real *src, real *dst, const int N);
// Uncoalesced read (via read-only cache) / coalesced write.
__global__ void transpose2(const real *src, real *dst, const int N);
// Shared-memory tiled transpose: coalesced global reads and writes.
__global__ void transpose3(const real *src, real *dst, const int N);
// Same as transpose3, with padding to avoid shared-memory bank conflicts.
__global__ void transpose4(const real *src, real *dst, const int N);
int main()
{
// The GPU used here has ~2 GB of device memory; two N*N matrices are allocated,
// so in transpose3/transpose4 the matrix dimension must stay below ~726 for
// float and ~512 for double.
const int N = 500;
const int M = N * N * sizeof(real);
// Read the tile edge back from constant memory and derive the launch config.
int SIZE = 0;
CHECK(cudaMemcpyFromSymbol(&SIZE, c_TILE_DIM, sizeof(int)));
const int grid_size_x = (N + SIZE - 1)/SIZE; // ceil-div so the grid covers all N columns/rows
const int grid_size_y = grid_size_x;
const dim3 block_size(SIZE, SIZE);
const dim3 grid_size(grid_size_x, grid_size_y);
// Host source matrix and result buffer.
real *h_matrix_org, *h_matrix_res;
h_matrix_org = new real[N*N];
h_matrix_res = new real[N*N];
for (int i = 0; i < N; ++i)
{
for (int j = 0; j < N; ++j)
{
h_matrix_org[i * N + j] = i*1.0e-2;
}
}
// show(h_matrix_org, N, "result.txt", "origin matrix");
real *d_matrix_org, *d_matrix_res;
CHECK(cudaMalloc(&d_matrix_org, M));
CHECK(cudaMalloc(&d_matrix_res, M));
CHECK(cudaMemcpy(d_matrix_org, h_matrix_org, M, cudaMemcpyDefault));
// Timing scheme: record `start` once, then a `stop` event after each kernel.
// Each kernel's cost is the new cumulative time minus the previous total.
// NOTE(review): the device-to-host copy sits inside each timed segment, so
// the reported figure includes that transfer as well as the kernel.
float elapsed_time = 0;
float curr_time = 0;
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
CHECK(cudaEventRecord(start));
cudaEventQuery(start); // deliberately not CHECKed: may return cudaErrorNotReady
// Transpose in global memory only (coalesced read, uncoalesced write).
transpose1<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);
CHECK(cudaGetLastError()); // surface launch-configuration errors
CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
// show(h_matrix_res, N, "result.txt", "transpose1");
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&curr_time, start, stop));
printf("matrix transpose1 time cost: %f ms.\n", curr_time - elapsed_time);
elapsed_time = curr_time;
// Transpose in global memory only (uncoalesced read, coalesced write).
transpose2<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
// show(h_matrix_res, N, "matrix.txt", "transpose2");
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&curr_time, start, stop));
printf("matrix transpose2 time cost: %f ms.\n", curr_time - elapsed_time);
elapsed_time = curr_time;
// Tiled transpose through shared memory (coalesced global reads and writes).
transpose3<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
// show(h_matrix_res, N, "result.txt", "transpose3");
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&curr_time, start, stop));
printf("matrix transpose3 time cost: %f ms.\n", curr_time - elapsed_time);
elapsed_time = curr_time;
// Tiled transpose with shared-memory padding to avoid bank conflicts.
transpose4<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);
CHECK(cudaGetLastError());
CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
// show(h_matrix_res, N, "result.txt", "transpose4");
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
CHECK(cudaEventElapsedTime(&curr_time, start, stop));
printf("matrix transpose4 time cost: %f ms.\n", curr_time - elapsed_time);
elapsed_time = curr_time;
// Release timing events (previously leaked) and all buffers.
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
delete[] h_matrix_res;
delete[] h_matrix_org;
CHECK(cudaFree(d_matrix_org));
CHECK(cudaFree(d_matrix_res));
return 0;
}
// Append an N x N matrix `x` (row-major, one row per output line) to `outfile`
// under a "----------------title" banner. Opens in append mode so successive
// dumps accumulate; prints an error and returns if the file cannot be opened.
void show(const real *x, const int N, std::string outfile, std::string title)
{
std::fstream out(outfile, std::ios::app);
if (!out.is_open())
{
// Fixed: `endl` was unqualified; there is no `using namespace std;` here.
std::cerr << "invalid output file: " << outfile << std::endl;
return;
}
out << "\n\n----------------" << title << std::endl;
for (int i = 0; i < N; ++i)
{
out << std::endl;
for (int j = 0; j < N; ++j)
{
out << std::setw(6) << x[i * N + j];
}
}
}
// Naive transpose: reads of `src` are coalesced (consecutive threads read
// consecutive columns of one row), but writes to `dst` stride by N and are
// therefore uncoalesced.
__global__ void transpose1(const real *src, real *dst, const int N)
{
const int col = blockIdx.x * c_TILE_DIM + threadIdx.x;
const int row = blockIdx.y * c_TILE_DIM + threadIdx.y;
if (col >= N || row >= N)
{
return; // grid over-covers N when N is not a multiple of the tile edge
}
dst[col * N + row] = src[row * N + col];
}
// Naive transpose, mirrored: writes to `dst` are coalesced while reads of
// `src` stride by N. `__ldg()` explicitly routes the strided reads through
// the read-only data cache to soften the penalty.
__global__ void transpose2(const real *src, real *dst, const int N)
{
const int col = blockIdx.x * c_TILE_DIM + threadIdx.x;
const int row = blockIdx.y * c_TILE_DIM + threadIdx.y;
if (col >= N || row >= N)
{
return; // guard the grid's over-coverage of N
}
dst[row * N + col] = __ldg(&src[col * N + row]);
}
// Tiled transpose through shared memory so that BOTH global reads and writes
// are coalesced: the uncoalesced (column-order) access happens only inside the
// fast on-chip shared tile.
// 1. Each block loads a TILE_DIM x TILE_DIM tile of `src` with coalesced reads.
// 2. After a barrier, it writes the transposed tile to `dst` with coalesced writes.
// Requires blockDim == (TILE_DIM, TILE_DIM).
__global__ void transpose3(const real *src, real *dst, const int N)
{
__shared__ real s_mat[TILE_DIM][TILE_DIM]; // one tile of the matrix
int bx = blockIdx.x * blockDim.x; // tile origin column in src
int by = blockIdx.y * blockDim.y; // tile origin row in src
int tx = threadIdx.x + bx; // this thread's column in src
int ty = threadIdx.y + by; // this thread's row in src
if (tx < N && ty < N)
{
// Coalesced global read into the shared tile (matrix stored row-major).
s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx];
}
// Barrier is outside the guard so every thread of the block reaches it.
__syncthreads();
// Destination coordinates swap the block offsets: tile (bx,by) lands at (by,bx).
int x = by + threadIdx.x; // destination column
int y = bx + threadIdx.y; // destination row
// Bug fix: the guard must test the DESTINATION coordinates (x, y), not
// (tx, ty). The old guard skipped valid boundary elements of dst whenever
// N was not a multiple of TILE_DIM, leaving them unwritten.
if (x < N && y < N)
{
dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y];
}
}
// Same tiled transpose as transpose3, but the shared tile's row is padded by
// one element so that the column-order reads (s_mat[threadIdx.x][...]) map to
// distinct shared-memory banks, eliminating the 32-way bank conflict.
// Requires blockDim == (TILE_DIM, TILE_DIM).
__global__ void transpose4(const real *src, real *dst, const int N)
{
__shared__ real s_mat[TILE_DIM][TILE_DIM + 1]; // +1 padding breaks bank conflicts
int bx = blockIdx.x * blockDim.x; // tile origin column in src
int by = blockIdx.y * blockDim.y; // tile origin row in src
int tx = threadIdx.x + bx; // this thread's column in src
int ty = threadIdx.y + by; // this thread's row in src
if (tx < N && ty < N)
{
// Coalesced global read into the padded shared tile.
s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx];
}
// All threads of the block must reach the barrier, so it sits outside the guard.
__syncthreads();
// Destination coordinates swap the block offsets: tile (bx,by) lands at (by,bx).
int x = by + threadIdx.x; // destination column
int y = bx + threadIdx.y; // destination row
// Bug fix: guard on the DESTINATION coordinates (x, y), not (tx, ty);
// otherwise boundary elements of dst stay unwritten when N % TILE_DIM != 0.
if (x < N && y < N)
{
dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y];
}
}