forked from OSchip/llvm-project
				
			
		
			
				
	
	
		
			110 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
			
		
		
	
	
			110 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
 | 
						|
//
 | 
						|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | 
						|
// See https://llvm.org/LICENSE.txt for license information.
 | 
						|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | 
						|
//
 | 
						|
//===----------------------------------------------------------------------===//
 | 
						|
///
 | 
						|
/// This file is a simple example of using Acxxel.
 | 
						|
///
 | 
						|
//===----------------------------------------------------------------------===//
 | 
						|
 | 
						|
/// [Example simple saxpy]
 | 
						|
#include "acxxel.h"
 | 
						|
 | 
						|
#include <array>
 | 
						|
#include <cstdio>
 | 
						|
#include <cstdlib>
 | 
						|
 | 
						|
/// A standard CUDA kernel computing X[i] = A * X[i] + Y[i] for i in [0, N).
///
/// Each thread handles at most one element, selected by its global index
/// along the x dimension. Threads whose index is not below N do nothing, so
/// the launch may contain more threads than there are elements.
///
/// \param A scalar multiplier applied to every element of X.
/// \param X input/output array; updated in place.
/// \param Y input array added to the scaled X; only read by the kernel.
/// \param N number of elements to process.
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  const int Index = threadIdx.x + (blockIdx.x * blockDim.x);

  // Surplus threads past the end of the data have no work to do.
  if (Index >= N)
    return;

  X[Index] = A * X[Index] + Y[Index];
}
 | 
						|
 | 
						|
// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
//
// Computes X = A * X + Y on the device: X and Y are copied to freshly
// allocated device arrays, saxpyKernel is launched on an Acxxel stream, and
// the result is copied back into X. If the stream reports an error after the
// final copy, the message is printed to stderr and the process exits with
// EXIT_FAILURE.
//
// NOTE(review): the getValue()/takeValue() calls below are not preceded by
// any error check; presumably they require the operation to have succeeded --
// confirm against acxxel.h.
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Nothing to compute for empty arrays, and a launch with an empty grid
  // would be rejected as an invalid configuration.
  if (N == 0)
    return;

  // Launch geometry: a fixed block size with enough blocks to cover all N
  // elements (ceiling division). Putting all N threads in a single block
  // fails once N exceeds the per-block thread limit; the kernel's bounds
  // check makes the surplus threads in the last block harmless.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Get the CUDA platform and make a CUDA stream.
  acxxel::Platform *CUDA = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream Stream = CUDA->createStream().takeValue();

  // Allocate space for device arrays.
  auto DeviceX = CUDA->mallocD<float>(N).takeValue();
  auto DeviceY = CUDA->mallocD<float>(N).takeValue();

  // Copy X and Y out to the device.
  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);

  // Launch the kernel using triple-chevron notation.
  saxpyKernel<<<BlockCount, ThreadsPerBlock, 0, Stream>>>(
      A, DeviceX, DeviceY, static_cast<int>(N));

  // Copy the results back to the host.
  acxxel::Status Status = Stream.syncCopyDToH(DeviceX, X).takeStatus();

  // Check for any errors.
  if (Status.isError()) {
    std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
                 Status.getMessage().c_str());
    std::exit(EXIT_FAILURE);
  }
}
 | 
						|
/// [Example simple saxpy]
 | 
						|
 | 
						|
/// [Example CUDA simple saxpy]
 | 
						|
/// Computes X = A * X + Y on the device using the plain CUDA runtime API.
///
/// X and Y are copied to freshly allocated device arrays, saxpyKernel is
/// launched, and the result is copied back into X. Every runtime call is
/// checked; on the first failure the CUDA error string is printed to stderr
/// and the process exits with EXIT_FAILURE.
///
/// \param A scalar multiplier.
/// \param X input/output host array; overwritten with the result.
/// \param Y input host array; only read.
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // Nothing to compute for empty arrays, and a launch with an empty grid
  // would be rejected as an invalid configuration.
  if (N == 0)
    return;

  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Launch geometry: a fixed block size with enough blocks to cover all N
  // elements (ceiling division). Putting all N threads in a single block
  // fails once N exceeds the per-block thread limit; the kernel's bounds
  // check makes the surplus threads in the last block harmless.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Report a failed runtime call and terminate. Checking each call right
  // away stops later calls from running on invalid pointers and attributes
  // the failure to the call that actually caused it.
  auto Check = [](cudaError_t Error) {
    if (Error != cudaSuccess) {
      std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                   cudaGetErrorString(Error));
      std::exit(EXIT_FAILURE);
    }
  };

  // Allocate space for device arrays.
  float *DeviceX = nullptr;
  float *DeviceY = nullptr;
  Check(cudaMalloc(&DeviceX, Size));
  Check(cudaMalloc(&DeviceY, Size));

  // Copy X and Y out to the device.
  Check(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
  Check(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

  // Launch the kernel using triple-chevron notation. A launch does not return
  // an error code, so configuration errors are fetched explicitly.
  saxpyKernel<<<BlockCount, ThreadsPerBlock>>>(A, DeviceX, DeviceY,
                                               static_cast<int>(N));
  Check(cudaGetLastError());

  // Copy the results back to the host. This blocking copy is also the point
  // at which errors raised while the kernel was executing are reported.
  Check(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

  // Free resources.
  Check(cudaFree(DeviceX));
  Check(cudaFree(DeviceY));
}
 | 
						|
/// [Example CUDA simple saxpy]
 | 
						|
 | 
						|
/// Runs \p SaxpyFunction on a fixed three-element problem and checks the
/// result.
///
/// The callable is invoked as SaxpyFunction(A, X, Y) and is expected to leave
/// A * X + Y in X. On the first element that differs from the expected value,
/// a message is printed to stderr and the process exits with EXIT_FAILURE.
template <typename F> void testSaxpy(F &&SaxpyFunction) {
  float A = 2.f;
  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};

  // Reference result: 2 * X + Y, element by element.
  const std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};

  SaxpyFunction(A, X, Y);

  for (int Index = 0; Index != 3; ++Index) {
    // Matching elements need no further attention.
    if (X[Index] == Expected[Index])
      continue;

    std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Index,
                 X[Index], Expected[Index]);
    std::exit(EXIT_FAILURE);
  }
}
 | 
						|
 | 
						|
/// Runs the saxpy check once against the Acxxel implementation and once
/// against the plain CUDA runtime implementation. Either check terminates the
/// process with a failure status on mismatch, so reaching the end means both
/// passed.
int main() {
  // Number of elements in the test arrays built by testSaxpy.
  constexpr size_t ElementCount = 3;

  testSaxpy(saxpy<ElementCount>);
  testSaxpy(cudaSaxpy<ElementCount>);
  return EXIT_SUCCESS;
}
 |