// Matrix transpose benchmark: four CUDA kernels comparing global-memory
// access patterns (coalesced vs. non-coalesced, shared-memory tiling,
// and bank-conflict padding).
#include "../common/error.cuh"
#include "../common/floats.hpp"

#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>

#define TILE_DIM 32  // Host-side tile edge length (block is TILE_DIM x TILE_DIM, at most 1024 threads).

__constant__ int c_TILE_DIM = 32;  // Device-side copy of the tile edge length, kept in constant memory.

void show(const real *matrix, const int N, std::string outfile, std::string title);
__global__ void transpose1(const real *src, real *dst, const int N);
__global__ void transpose2(const real *src, real *dst, const int N);
__global__ void transpose3(const real *src, real *dst, const int N);
__global__ void transpose4(const real *src, real *dst, const int N);

int main()
{
    // With 2 GB of device memory (float = 4 bytes, double = 8 bytes) the
    // matrix dimension used by transpose3/transpose4 must not exceed:
    //   - 726 for float;
    //   - 512 for double.
    const int N = 500;
    const int M = N * N * sizeof(real);

    // Read the tile size back from constant memory so the host launch
    // configuration always matches the value the kernels use.
    int SIZE = 0;
    CHECK(cudaMemcpyFromSymbol(&SIZE, c_TILE_DIM, sizeof(int)));

    const int grid_size_x = (N + SIZE - 1) / SIZE;  // ceil(N / SIZE)
    const int grid_size_y = grid_size_x;            // square matrix -> square grid
    const dim3 block_size(SIZE, SIZE);
    const dim3 grid_size(grid_size_x, grid_size_y);

    // Host-side matrices: source and (reused) transpose result buffer.
    real *h_matrix_org = new real[N * N];
    real *h_matrix_res = new real[N * N];
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            h_matrix_org[i * N + j] = static_cast<real>(i) * static_cast<real>(1.0e-2);
        }
    }
    // show(h_matrix_org, N, "result.txt", "origin matrix");

    real *d_matrix_org, *d_matrix_res;
    CHECK(cudaMalloc(&d_matrix_org, M));
    CHECK(cudaMalloc(&d_matrix_res, M));
    CHECK(cudaMemcpy(d_matrix_org, h_matrix_org, M, cudaMemcpyDefault));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // The four variants, from naive to shared memory with padding:
    //   1. coalesced global reads, non-coalesced writes;
    //   2. non-coalesced reads (through __ldg), coalesced writes;
    //   3. shared-memory tile -> coalesced global reads AND writes;
    //   4. same as 3, plus one element of row padding to avoid shared-memory
    //      bank conflicts.
    void (*kernels[])(const real *, real *, const int) =
        {transpose1, transpose2, transpose3, transpose4};
    const int num_kernels = sizeof(kernels) / sizeof(kernels[0]);

    for (int k = 0; k < num_kernels; ++k)
    {
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);  // flush the start event; return value intentionally unchecked

        kernels[k]<<<grid_size, block_size>>>(d_matrix_org, d_matrix_res, N);
        CHECK(cudaGetLastError());  // catch launch-configuration errors

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));

        // Time only the kernel itself; the verification copy below is
        // deliberately outside the timed region (the original timed the
        // device-to-host copy as part of each "kernel" measurement).
        float elapsed_time = 0;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("matrix transpose%d time cost: %f ms.\n", k + 1, elapsed_time);

        CHECK(cudaMemcpy(h_matrix_res, d_matrix_res, M, cudaMemcpyDefault));
        // show(h_matrix_res, N, "result.txt", "transpose" + std::to_string(k + 1));
    }

    // Release timing events (the original leaked them) and all buffers.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    delete[] h_matrix_res;
    delete[] h_matrix_org;
    CHECK(cudaFree(d_matrix_org));
    CHECK(cudaFree(d_matrix_res));

    return 0;
}
// Append an N x N row-major matrix `x` (length N * N) to `outfile`, preceded
// by a separator line carrying `title`. On failure to open the file, reports
// to stderr and returns without writing.
void show(const real *x, const int N, std::string outfile, std::string title)
{
    std::ofstream out(outfile, std::ios::app);
    if (!out.is_open())
    {
        // Bug fix: the original used unqualified `endl`, which does not
        // compile without `using namespace std`.
        std::cerr << "invalid output file: " << outfile << std::endl;
        return;
    }

    out << "\n\n----------------" << title << std::endl;

    for (int i = 0; i < N; ++i)
    {
        out << std::endl;  // one matrix row per output line
        for (int j = 0; j < N; ++j)
        {
            out << std::setw(6) << x[i * N + j];  // fixed-width columns
        }
    }
}
// Naive transpose: each thread moves one element of the N x N matrix.
// Reads from `src` are coalesced (threadIdx.x walks along a row); writes to
// `dst` are strided by N and therefore non-coalesced.
// Expects (c_TILE_DIM x c_TILE_DIM) blocks covering the matrix.
__global__ void transpose1(const real *src, real *dst, const int N)
{
    const int col = blockIdx.x * c_TILE_DIM + threadIdx.x;
    const int row = blockIdx.y * c_TILE_DIM + threadIdx.y;

    // Guard the ragged edge when N is not a multiple of the tile size.
    if (col >= N || row >= N) return;

    dst[col * N + row] = src[row * N + col];
}
// Transpose with the access pattern opposite to transpose1: reads from `src`
// are strided (non-coalesced), writes to `dst` are coalesced. `__ldg()`
// explicitly routes the scattered reads through the read-only data cache.
// Expects (c_TILE_DIM x c_TILE_DIM) blocks covering the matrix.
__global__ void transpose2(const real *src, real *dst, const int N)
{
    const int col = blockIdx.x * c_TILE_DIM + threadIdx.x;
    const int row = blockIdx.y * c_TILE_DIM + threadIdx.y;

    // Guard the ragged edge when N is not a multiple of the tile size.
    if (col >= N || row >= N) return;

    dst[row * N + col] = __ldg(&src[col * N + row]);
}
// Tiled transpose through shared memory so that BOTH global-memory phases are
// coalesced: the tile is loaded row-wise (coalesced), then stored transposed
// row-wise into `dst` (also coalesced); the unavoidable strided access is
// absorbed by fast shared memory. Requires blockDim == (TILE_DIM, TILE_DIM).
__global__ void transpose3(const real *src, real *dst, const int N)
{
    // One tile of the source matrix, staged per block. Uninitialized entries
    // (edge tiles) are never read back thanks to the store-phase guard below.
    __shared__ real s_mat[TILE_DIM][TILE_DIM];

    const int bx = blockIdx.x * blockDim.x;  // first source column of this block's tile
    const int by = blockIdx.y * blockDim.y;  // first source row of this block's tile

    const int tx = bx + threadIdx.x;  // this thread's source column
    const int ty = by + threadIdx.y;  // this thread's source row

    // Coalesced load: consecutive threads read consecutive src elements.
    if (tx < N && ty < N)
    {
        s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx];
    }
    __syncthreads();  // whole tile must be staged before any thread reads it

    // Coalesced store of the transposed tile.
    int x = by + threadIdx.x;  // destination column
    int y = bx + threadIdx.y;  // destination row
    // Bug fix: guard with the DESTINATION coordinates. The original reused
    // the source-coordinate test (tx < N && ty < N), which let edge blocks
    // with blockIdx.x != blockIdx.y write past the end of `dst` whenever N is
    // not a multiple of TILE_DIM.
    if (x < N && y < N)
    {
        dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y];
    }
}
// Same tiled transpose as transpose3, but the shared-memory tile row is
// padded by one element. The padding staggers rows across the 32 shared-
// memory banks, so the column-wise reads in the store phase no longer cause
// 32-way bank conflicts. Requires blockDim == (TILE_DIM, TILE_DIM).
__global__ void transpose4(const real *src, real *dst, const int N)
{
    __shared__ real s_mat[TILE_DIM][TILE_DIM + 1];  // +1 column defeats bank conflicts

    const int bx = blockIdx.x * blockDim.x;  // first source column of this block's tile
    const int by = blockIdx.y * blockDim.y;  // first source row of this block's tile

    const int tx = bx + threadIdx.x;  // this thread's source column
    const int ty = by + threadIdx.y;  // this thread's source row

    // Coalesced load of one tile.
    if (tx < N && ty < N)
    {
        s_mat[threadIdx.y][threadIdx.x] = src[ty * N + tx];
    }
    __syncthreads();  // whole tile must be staged before any thread reads it

    // Coalesced store of the transposed tile.
    int x = by + threadIdx.x;  // destination column
    int y = bx + threadIdx.y;  // destination row
    // Bug fix: guard with the DESTINATION coordinates. The original reused
    // the source-coordinate test (tx < N && ty < N), which let edge blocks
    // with blockIdx.x != blockIdx.y write past the end of `dst` whenever N is
    // not a multiple of TILE_DIM.
    if (x < N && y < N)
    {
        dst[y * N + x] = s_mat[threadIdx.x][threadIdx.y];
    }
}