CudaSteps/capter9/neighbor.cu

238 lines
6.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "../common/error.cuh"
#include "../common/floats.hpp"
#include "../common/clock.cuh"
#include <fstream>
#include <regex>
#include <string>
#include <vector>
void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y);
void write_data(const std::string &fstr, const int *NL, const int N, const int M);
void find_neighbor(int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real minDis);
__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real mindDis);
__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real minDis);
int main()
{
cout << FLOAT_PREC << endl;
std::string fstr = "xy.txt";
std::string fout = "result.txt";
std::vector<real> x, y;
read_data(fstr, x, y);
int N = x.size(), M = 10;
real minDis = 1.9*1.9;
int *NN = new int[N];
int *NL = new int[N*M];
for (int i = 0; i < N; ++i)
{
NN[i] = 0;
for (int j = 0; j < M; ++j)
{
NL[i*M + j] = -1;
}
}
int *d_NN, *d_NL;
CHECK(cudaMalloc(&d_NN, N*sizeof(int)));
CHECK(cudaMalloc(&d_NL, N*M*sizeof(int)));
real *d_x, *d_y;
CHECK(cudaMalloc(&d_x, N*sizeof(real)));
CHECK(cudaMalloc(&d_y, N*sizeof(real)));
cppClockStart
find_neighbor(NN, NL, x.data(), y.data(), N, M, minDis);
// write_data(fout, NL, N, M);
cppClockCurr
cudaClockStart
CHECK(cudaMemcpy(d_x, x.data(), N*sizeof(real), cudaMemcpyDefault));
CHECK(cudaMemcpy(d_y, y.data(), N*sizeof(real), cudaMemcpyDefault));
int block_size = 128;
int grid_size = (N + block_size - 1)/block_size;
find_neighbor_atomic<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
CHECK(cudaMemcpy(NN, d_NN, N*sizeof(int), cudaMemcpyDefault));
CHECK(cudaMemcpy(NL, d_NL, N*M*sizeof(int), cudaMemcpyDefault));
// write_data(fout, NL, N, M);
cudaClockCurr
CHECK(cudaMemcpy(d_x, x.data(), N*sizeof(real), cudaMemcpyDefault));
CHECK(cudaMemcpy(d_y, y.data(), N*sizeof(real), cudaMemcpyDefault));
find_neighbor_gpu<<<grid_size, block_size>>>(d_NN, d_NL, d_x, d_y, N, M, minDis);
CHECK(cudaMemcpy(NN, d_NN, N*sizeof(int), cudaMemcpyDefault));
CHECK(cudaMemcpy(NL, d_NL, N*M*sizeof(int), cudaMemcpyDefault));
cudaClockCurr
write_data(fout, NL, N, M);
delete[] NN;
delete[] NL;
CHECK(cudaFree(d_NN));
CHECK(cudaFree(d_NL));
CHECK(cudaFree(d_x));
CHECK(cudaFree(d_y));
return 0;
}
void find_neighbor(int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real minDis)
{
for (int i = 0; i < N; ++i)
{
NN[i] = 0;
}
for (int i = 0; i < N; ++i)
{
for (int j = i + 1; j < N; ++j)
{
real dx = x[j] - x[i];
real dy = y[j] - y[i];
real dis = dx * dx + dy * dy;
if (dis < minDis) // 比较平方,减少计算量。
{
NL[i*M + NN[i]] = j; // 一维数组存放二维数据。
NN[i] ++;
NL[j*M + NN[j]] = i; // 省去一般的判断。
NN[j]++;
}
}
}
}
__global__ void find_neighbor_gpu (int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real minDis)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
int count = 0; // 寄存器变量减少对全局变量NN的访问。
for (int j = 0; j < N; ++j) // 访问次数 N*N性能降低。
{
real dx = x[j] - x[i];
real dy = y[j] - y[i];
real dis = dx * dx + dy * dy;
if (dis < minDis && i != j) // 距离判断优先,提高“假”的命中率。
{
// 修改了全局内存NL的数据排列方式实现合并访问i 与 threadIdx.x的变化步调一致
// ???
NL[(count++) * N + i] = j;
}
}
NN[i] = count;
}
}
__global__ void find_neighbor_atomic(int *NN, int *NL, const real *x, const real *y,
const int N, const int M,
const real minDis)
{
// 将 cpu 版本的第一层循环展开,一个线程对应一个原子操作。
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
NN[i] = 0;
for (int j = i + 1; j < N; ++j)
{
real dx = x[j] - x[i];
real dy = y[j] - y[i];
real dis = dx * dx + dy*dy;
if (dis < minDis)
{
// 原子函数提高的性能但是在NL中产生了一定的随机性不便于后期调试。
int old_i_num = atomicAdd(&NN[i], 1); // 返回值为旧值,当前线程对应点的邻居数
NL[i*M + old_i_num] = j; // 当前线程对应点的新邻居
int old_j_num = atomicAdd(&NN[j], 1); // 返回值为旧值,当前邻居点的邻居数
NL[j*M + old_j_num] = i; // 当前邻居点的新邻居
}
}
}
}
void read_data(const std::string &fstr, std::vector<real> &x, std::vector<real> &y)
{
x.clear();
y.clear();
std::fstream reader(fstr, std::ios::in);
if (!reader.is_open())
{
std::cerr << "data file open failed.\n";
return;
}
std::regex re{"[\\s,]+"};
std::string line;
while(std::getline(reader, line))
{
std::vector<std::string> arr{std::sregex_token_iterator(line.begin(), line.end(), re, -1),
std::sregex_token_iterator()};
if (arr.size() < 2 || arr[0].find("#") != std::string::npos)
{
continue;
}
x.push_back(stod(arr[0]));
y.push_back(stod(arr[1]));
}
}
void write_data(const std::string &fstr, const int *NL, const int N, const int M)
{
std::fstream writer(fstr, std::ios::out);
if (!writer.is_open())
{
std::cerr << "result file open failed.\n";
return;
}
for (int i = 0; i < N; ++i)
{
writer << i << "\t";
for (int j = 0; j < M; ++j)
{
int ind = NL[i*M + j];
if (ind >= 0)
{
writer << ind << "\t";
}
}
writer << endl;
}
}