forked from OSchip/llvm-project
GPGPU: Emit data-transfer code
Also factor out getArraySize() to avoid code dupliciation and reorder some function arguments to indicate the direction into which data is transferred. llvm-svn: 276636
This commit is contained in:
parent
7287aeddf1
commit
13c78e4d51
|
|
@ -148,8 +148,8 @@ private:
|
|||
/// more.
|
||||
std::vector<Value *> LocalArrays;
|
||||
|
||||
/// A list of device arrays that has been allocated.
|
||||
std::vector<Value *> AllocatedDevArrays;
|
||||
/// A map from ScopArrays to their corresponding device allocations.
|
||||
std::map<ScopArrayInfo *, Value *> DeviceAllocations;
|
||||
|
||||
/// The current GPU context.
|
||||
Value *GPUContext;
|
||||
|
|
@ -181,13 +181,22 @@ private:
|
|||
///
|
||||
/// - ScopStmt: A computational statement (TODO)
|
||||
/// - Kernel: A GPU kernel call (TODO)
|
||||
/// - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
|
||||
/// - Data-Transfer: A GPU <-> CPU data-transfer
|
||||
/// - In-kernel synchronization
|
||||
/// - In-kernel memory copy statement
|
||||
///
|
||||
/// @param UserStmt The ast node to generate code for.
|
||||
virtual void createUser(__isl_take isl_ast_node *UserStmt);
|
||||
|
||||
enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
|
||||
|
||||
/// Create code for a data transfer statement
|
||||
///
|
||||
/// @param TransferStmt The data transfer statement.
|
||||
/// @param Direction The direction in which to transfer data.
|
||||
void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
|
||||
enum DataDirection Direction);
|
||||
|
||||
/// Find llvm::Values referenced in GPU kernel.
|
||||
///
|
||||
/// @param Kernel The kernel to scan for llvm::Values
|
||||
|
|
@ -202,6 +211,11 @@ private:
|
|||
/// @param KernelStmt The ast node to generate kernel code for.
|
||||
void createKernel(__isl_take isl_ast_node *KernelStmt);
|
||||
|
||||
/// Generate code that computes the size of an array.
|
||||
///
|
||||
/// @param Array The array for which to compute a size.
|
||||
Value *getArraySize(gpu_array_info *Array);
|
||||
|
||||
/// Create kernel function.
|
||||
///
|
||||
/// Create a kernel function located in a newly created module that can serve
|
||||
|
|
@ -296,6 +310,20 @@ private:
|
|||
///
|
||||
/// @param Array The device array to free.
|
||||
void createCallFreeDeviceMemory(Value *Array);
|
||||
|
||||
/// Create a call to copy data from host to device.
|
||||
///
|
||||
/// @param HostPtr A pointer to the host data that should be copied.
|
||||
/// @param DevicePtr A device pointer specifying the location to copy to.
|
||||
void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
|
||||
Value *Size);
|
||||
|
||||
/// Create a call to copy data from device to host.
|
||||
///
|
||||
/// @param DevicePtr A pointer to the device data that should be copied.
|
||||
/// @param HostPtr A host pointer specifying the location to copy to.
|
||||
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
|
||||
Value *Size);
|
||||
};
|
||||
|
||||
void GPUNodeBuilder::initializeAfterRTH() {
|
||||
|
|
@ -314,36 +342,22 @@ void GPUNodeBuilder::allocateDeviceArrays() {
|
|||
|
||||
for (int i = 0; i < Prog->n_array; ++i) {
|
||||
gpu_array_info *Array = &Prog->array[i];
|
||||
auto *ScopArray = (ScopArrayInfo *)Array->user;
|
||||
std::string DevArrayName("p_dev_array_");
|
||||
DevArrayName.append(Array->name);
|
||||
|
||||
Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);
|
||||
|
||||
if (!gpu_array_is_scalar(Array)) {
|
||||
auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
|
||||
isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);
|
||||
|
||||
for (unsigned int i = 1; i < Array->n_index; i++) {
|
||||
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
|
||||
isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
|
||||
Res = isl_ast_expr_mul(Res, Expr);
|
||||
}
|
||||
|
||||
Value *NumElements = ExprBuilder.create(Res);
|
||||
ArraySize = Builder.CreateMul(ArraySize, NumElements);
|
||||
}
|
||||
|
||||
Value *ArraySize = getArraySize(Array);
|
||||
Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
|
||||
DevArray->setName(DevArrayName);
|
||||
AllocatedDevArrays.push_back(DevArray);
|
||||
DeviceAllocations[ScopArray] = DevArray;
|
||||
}
|
||||
|
||||
isl_ast_build_free(Build);
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::freeDeviceArrays() {
|
||||
for (auto &Array : AllocatedDevArrays)
|
||||
createCallFreeDeviceMemory(Array);
|
||||
for (auto &Array : DeviceAllocations)
|
||||
createCallFreeDeviceMemory(Array.second);
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
|
||||
|
|
@ -380,6 +394,48 @@ Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
|
|||
return Builder.CreateCall(F, {Size});
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
|
||||
Value *DeviceData,
|
||||
Value *Size) {
|
||||
const char *Name = "polly_copyFromHostToDevice";
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
Function *F = M->getFunction(Name);
|
||||
|
||||
// If F is not available, declare it.
|
||||
if (!F) {
|
||||
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
std::vector<Type *> Args;
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
Args.push_back(Builder.getInt64Ty());
|
||||
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
|
||||
F = Function::Create(Ty, Linkage, Name, M);
|
||||
}
|
||||
|
||||
Builder.CreateCall(F, {HostData, DeviceData, Size});
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
|
||||
Value *HostData,
|
||||
Value *Size) {
|
||||
const char *Name = "polly_copyFromDeviceToHost";
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
Function *F = M->getFunction(Name);
|
||||
|
||||
// If F is not available, declare it.
|
||||
if (!F) {
|
||||
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
||||
std::vector<Type *> Args;
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
Args.push_back(Builder.getInt8PtrTy());
|
||||
Args.push_back(Builder.getInt64Ty());
|
||||
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
|
||||
F = Function::Create(Ty, Linkage, Name, M);
|
||||
}
|
||||
|
||||
Builder.CreateCall(F, {DeviceData, HostData, Size});
|
||||
}
|
||||
|
||||
Value *GPUNodeBuilder::createCallInitContext() {
|
||||
const char *Name = "polly_initContext";
|
||||
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
||||
|
|
@ -421,6 +477,58 @@ static bool isPrefix(std::string String, std::string Prefix) {
|
|||
return String.find(Prefix) == 0;
|
||||
}
|
||||
|
||||
Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
|
||||
isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
|
||||
Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);
|
||||
|
||||
if (!gpu_array_is_scalar(Array)) {
|
||||
auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
|
||||
isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);
|
||||
|
||||
for (unsigned int i = 1; i < Array->n_index; i++) {
|
||||
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
|
||||
isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
|
||||
Res = isl_ast_expr_mul(Res, Expr);
|
||||
}
|
||||
|
||||
Value *NumElements = ExprBuilder.create(Res);
|
||||
ArraySize = Builder.CreateMul(ArraySize, NumElements);
|
||||
}
|
||||
isl_ast_build_free(Build);
|
||||
return ArraySize;
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
|
||||
enum DataDirection Direction) {
|
||||
isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
|
||||
isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
|
||||
isl_id *Id = isl_ast_expr_get_id(Arg);
|
||||
auto Array = (gpu_array_info *)isl_id_get_user(Id);
|
||||
auto ScopArray = (ScopArrayInfo *)(Array->user);
|
||||
|
||||
Value *Size = getArraySize(Array);
|
||||
Value *HostPtr = ScopArray->getBasePtr();
|
||||
|
||||
Value *DevPtr = DeviceAllocations[ScopArray];
|
||||
|
||||
if (gpu_array_is_scalar(Array)) {
|
||||
HostPtr = Builder.CreateAlloca(ScopArray->getElementType());
|
||||
Builder.CreateStore(ScopArray->getBasePtr(), HostPtr);
|
||||
}
|
||||
|
||||
HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
|
||||
|
||||
if (Direction == HOST_TO_DEVICE)
|
||||
createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
|
||||
else
|
||||
createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);
|
||||
|
||||
isl_id_free(Id);
|
||||
isl_ast_expr_free(Arg);
|
||||
isl_ast_expr_free(Expr);
|
||||
isl_ast_node_free(TransferStmt);
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
|
||||
isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
|
||||
isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
|
||||
|
|
@ -435,10 +543,15 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
|
|||
return;
|
||||
}
|
||||
|
||||
if (isPrefix(Str, "to_device") || isPrefix(Str, "from_device")) {
|
||||
// TODO: Insert memory copies
|
||||
if (isPrefix(Str, "to_device")) {
|
||||
createDataTransfer(UserStmt, HOST_TO_DEVICE);
|
||||
isl_ast_expr_free(Expr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (isPrefix(Str, "from_device")) {
|
||||
createDataTransfer(UserStmt, DEVICE_TO_HOST);
|
||||
isl_ast_expr_free(Expr);
|
||||
isl_ast_node_free(UserStmt);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1202,6 +1315,7 @@ public:
|
|||
PPCGArray.global = false;
|
||||
PPCGArray.linearize = false;
|
||||
PPCGArray.dep_order = nullptr;
|
||||
PPCGArray.user = Array;
|
||||
|
||||
setArrayBounds(PPCGArray, Array);
|
||||
i++;
|
||||
|
|
|
|||
|
|
@ -54,6 +54,8 @@ struct gpu_array_info {
|
|||
* It is set to NULL otherwise.
|
||||
*/
|
||||
isl_union_map *dep_order;
|
||||
|
||||
void *user;
|
||||
};
|
||||
|
||||
/* Represents an outer array accessed by a ppcg_kernel, localized
|
||||
|
|
|
|||
|
|
@ -94,6 +94,10 @@
|
|||
; IR: polly.start:
|
||||
; IR-NEXT: [[GPUContext:%.*]] = call i8* @polly_initContext()
|
||||
; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 4194304)
|
||||
; IR-NEXT: [[HostPtr:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[HostPtr]], i8* %p_dev_array_MemRef_A, i64 4194304)
|
||||
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
||||
; IR-NEXT: call void @polly_freeDeviceMemory(i8* %p_dev_array_MemRef_A)
|
||||
; IR-NEXT: call void @polly_freeContext(i8* [[GPUContext]])
|
||||
; IR-NEXT: br label %polly.exiting
|
||||
|
|
|
|||
|
|
@ -296,16 +296,16 @@ void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
|||
}
|
||||
}
|
||||
|
||||
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
int MemSize) {
|
||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
long MemSize) {
|
||||
dump_function();
|
||||
|
||||
CUdeviceptr CuDevData = DevData->Cuda;
|
||||
CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize);
|
||||
}
|
||||
|
||||
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
int MemSize) {
|
||||
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
long MemSize) {
|
||||
dump_function();
|
||||
|
||||
if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
|
||||
|
|
|
|||
|
|
@ -78,10 +78,10 @@ PollyGPUContext *polly_initContext();
|
|||
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module);
|
||||
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
|
||||
PollyGPUFunction **Kernel);
|
||||
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
int MemSize);
|
||||
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
int MemSize);
|
||||
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
|
||||
long MemSize);
|
||||
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
|
||||
long MemSize);
|
||||
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
|
||||
int BlockHeight, PollyGPUDevicePtr *DevData);
|
||||
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
|
||||
|
|
|
|||
Loading…
Reference in New Issue