forked from OSchip/llvm-project
				
			[OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs with lightweight runtime.
Summary: We need the support for per-team shared variables to support codegen for lastprivates/reductions. Patch adds this support by using shared memory if the total size of the reductions/lastprivates is <= 128 bytes, then pre-allocated buffer in global memory if size is <= 4K bytes,or uses malloc/free, otherwise. Reviewers: gtbercea, kkwli0, grokos Subscribers: guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D51875 llvm-svn: 342737
This commit is contained in:
		
							parent
							
								
									8171bd8e0f
								
							
						
					
					
						commit
						022bf16b41
					
				| 
						 | 
					@ -378,6 +378,12 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
 | 
				
			||||||
// as long as the size requested fits the pre-allocated size.
 | 
					// as long as the size requested fits the pre-allocated size.
 | 
				
			||||||
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
 | 
					EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
 | 
				
			||||||
    int16_t UseSharedMemory) {
 | 
					    int16_t UseSharedMemory) {
 | 
				
			||||||
 | 
					  if (isRuntimeUninitialized()) {
 | 
				
			||||||
 | 
					    ASSERT0(LT_FUSSY, isSPMDMode(),
 | 
				
			||||||
 | 
					            "Expected SPMD mode with uninitialized runtime.");
 | 
				
			||||||
 | 
					    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Frame pointer must be visible to all workers in the same warp.
 | 
					  // Frame pointer must be visible to all workers in the same warp.
 | 
				
			||||||
  unsigned WID = getWarpId();
 | 
					  unsigned WID = getWarpId();
 | 
				
			||||||
  void *&FrameP = DataSharingState.FramePtr[WID];
 | 
					  void *&FrameP = DataSharingState.FramePtr[WID];
 | 
				
			||||||
| 
						 | 
					@ -456,6 +462,12 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
 | 
				
			||||||
// reclaim all outstanding global memory slots since it is
 | 
					// reclaim all outstanding global memory slots since it is
 | 
				
			||||||
// likely we have reached the end of the kernel.
 | 
					// likely we have reached the end of the kernel.
 | 
				
			||||||
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
 | 
					EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
 | 
				
			||||||
 | 
					  if (isRuntimeUninitialized()) {
 | 
				
			||||||
 | 
					    ASSERT0(LT_FUSSY, isSPMDMode(),
 | 
				
			||||||
 | 
					            "Expected SPMD mode with uninitialized runtime.");
 | 
				
			||||||
 | 
					    return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (IsWarpMasterActiveThread()) {
 | 
					  if (IsWarpMasterActiveThread()) {
 | 
				
			||||||
    unsigned WID = getWarpId();
 | 
					    unsigned WID = getWarpId();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,6 +38,8 @@ __device__ __shared__
 | 
				
			||||||
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
 | 
					__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
 | 
				
			||||||
    *omptarget_nvptx_simpleThreadPrivateContext;
 | 
					    *omptarget_nvptx_simpleThreadPrivateContext;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// The team master sets the outlined parallel function in this variable to
 | 
					// The team master sets the outlined parallel function in this variable to
 | 
				
			||||||
// communicate with the workers.  Since it is in shared memory, there is one
 | 
					// communicate with the workers.  Since it is in shared memory, there is one
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -25,13 +25,23 @@ extern __device__ omptarget_nvptx_Queue<
 | 
				
			||||||
    omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
 | 
					    omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
 | 
				
			||||||
    omptarget_nvptx_device_simpleState[MAX_SM];
 | 
					    omptarget_nvptx_device_simpleState[MAX_SM];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// init entry points
 | 
					// init entry points
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INLINE unsigned nsmid() {
 | 
				
			||||||
 | 
					  unsigned n;
 | 
				
			||||||
 | 
					  asm("mov.u32 %0, %%nsmid;" : "=r"(n));
 | 
				
			||||||
 | 
					  return n;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
INLINE unsigned smid() {
 | 
					INLINE unsigned smid() {
 | 
				
			||||||
  unsigned id;
 | 
					  unsigned id;
 | 
				
			||||||
  asm("mov.u32 %0, %%smid;" : "=r"(id));
 | 
					  asm("mov.u32 %0, %%smid;" : "=r"(id));
 | 
				
			||||||
 | 
					  ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
 | 
				
			||||||
 | 
					          "Expected number of SMs is less than reported.");
 | 
				
			||||||
  return id;
 | 
					  return id;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -108,6 +118,10 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
 | 
				
			||||||
      int slot = smid() % MAX_SM;
 | 
					      int slot = smid() % MAX_SM;
 | 
				
			||||||
      omptarget_nvptx_simpleThreadPrivateContext =
 | 
					      omptarget_nvptx_simpleThreadPrivateContext =
 | 
				
			||||||
          omptarget_nvptx_device_simpleState[slot].Dequeue();
 | 
					          omptarget_nvptx_device_simpleState[slot].Dequeue();
 | 
				
			||||||
 | 
					      // Reuse the memory allocated for the full runtime as the preallocated
 | 
				
			||||||
 | 
					      // global memory buffer for the lightweight runtime.
 | 
				
			||||||
 | 
					      omptarget_nvptx_simpleGlobalData =
 | 
				
			||||||
 | 
					          omptarget_nvptx_device_State[slot].Dequeue();
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
    omptarget_nvptx_simpleThreadPrivateContext->Init();
 | 
					    omptarget_nvptx_simpleThreadPrivateContext->Init();
 | 
				
			||||||
| 
						 | 
					@ -177,6 +191,10 @@ EXTERN void __kmpc_spmd_kernel_deinit() {
 | 
				
			||||||
      int slot = smid() % MAX_SM;
 | 
					      int slot = smid() % MAX_SM;
 | 
				
			||||||
      omptarget_nvptx_device_simpleState[slot].Enqueue(
 | 
					      omptarget_nvptx_device_simpleState[slot].Enqueue(
 | 
				
			||||||
          omptarget_nvptx_simpleThreadPrivateContext);
 | 
					          omptarget_nvptx_simpleThreadPrivateContext);
 | 
				
			||||||
 | 
					      // Enqueue global memory back.
 | 
				
			||||||
 | 
					      omptarget_nvptx_device_State[slot].Enqueue(
 | 
				
			||||||
 | 
					          reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
 | 
				
			||||||
 | 
					              omptarget_nvptx_simpleGlobalData));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    return;
 | 
					    return;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -113,6 +113,8 @@ enum DATA_SHARING_SIZES {
 | 
				
			||||||
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
 | 
					  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
 | 
				
			||||||
  // The maximum number of warps in use
 | 
					  // The maximum number of warps in use
 | 
				
			||||||
  DS_Max_Warp_Number = 32,
 | 
					  DS_Max_Warp_Number = 32,
 | 
				
			||||||
 | 
					  // The size of the preallocated shared memory buffer per team
 | 
				
			||||||
 | 
					  DS_Shared_Memory_Size = 128,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Data structure to keep in shared memory that traces the current slot, stack,
 | 
					// Data structure to keep in shared memory that traces the current slot, stack,
 | 
				
			||||||
| 
						 | 
					@ -386,12 +388,15 @@ struct omptarget_device_environmentTy {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class omptarget_nvptx_SimpleThreadPrivateContext {
 | 
					class omptarget_nvptx_SimpleThreadPrivateContext {
 | 
				
			||||||
  uint16_t par_level[MAX_THREADS_PER_TEAM];
 | 
					  uint16_t par_level[MAX_THREADS_PER_TEAM];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
  INLINE void Init() {
 | 
					  INLINE void Init() {
 | 
				
			||||||
    ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
 | 
					    ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
 | 
				
			||||||
            "Expected SPMD + uninitialized runtime modes.");
 | 
					            "Expected SPMD + uninitialized runtime modes.");
 | 
				
			||||||
    par_level[GetThreadIdInBlock()] = 0;
 | 
					    par_level[GetThreadIdInBlock()] = 0;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  static INLINE void *Allocate(size_t DataSize);
 | 
				
			||||||
 | 
					  static INLINE void Deallocate(void *Ptr);
 | 
				
			||||||
  INLINE void IncParLevel() {
 | 
					  INLINE void IncParLevel() {
 | 
				
			||||||
    ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
 | 
					    ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
 | 
				
			||||||
            "Expected SPMD + uninitialized runtime modes.");
 | 
					            "Expected SPMD + uninitialized runtime modes.");
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -202,3 +202,36 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
 | 
				
			||||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
 | 
					INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
 | 
				
			||||||
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
 | 
					  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Lightweight runtime functions.
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Shared memory buffer for globalization support.
 | 
				
			||||||
 | 
					static __align__(16) __device__ __shared__ char
 | 
				
			||||||
 | 
					    omptarget_static_buffer[DS_Shared_Memory_Size];
 | 
				
			||||||
 | 
					static __device__ __shared__ void *omptarget_spmd_allocated;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INLINE void *
 | 
				
			||||||
 | 
					omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
 | 
				
			||||||
 | 
					  if (DataSize <= DS_Shared_Memory_Size)
 | 
				
			||||||
 | 
					    return ::omptarget_static_buffer;
 | 
				
			||||||
 | 
					  if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
 | 
				
			||||||
 | 
					    return ::omptarget_nvptx_simpleGlobalData;
 | 
				
			||||||
 | 
					  if (threadIdx.x == 0)
 | 
				
			||||||
 | 
					    omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
 | 
				
			||||||
 | 
					  __syncthreads();
 | 
				
			||||||
 | 
					  return omptarget_spmd_allocated;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					INLINE void
 | 
				
			||||||
 | 
					omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
 | 
				
			||||||
 | 
					  if (Ptr != ::omptarget_static_buffer &&
 | 
				
			||||||
 | 
					      Ptr != ::omptarget_nvptx_simpleGlobalData) {
 | 
				
			||||||
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					    if (threadIdx.x == 0)
 | 
				
			||||||
 | 
					      SafeFree(Ptr, "SPMD teams dealloc");
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -34,7 +34,10 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Maximum number of omp state objects per SM allocated statically in global
 | 
					// Maximum number of omp state objects per SM allocated statically in global
 | 
				
			||||||
// memory.
 | 
					// memory.
 | 
				
			||||||
#if __CUDA_ARCH__ >= 600
 | 
					#if __CUDA_ARCH__ >= 700
 | 
				
			||||||
 | 
					#define OMP_STATE_COUNT 32
 | 
				
			||||||
 | 
					#define MAX_SM 84
 | 
				
			||||||
 | 
					#elif __CUDA_ARCH__ >= 600
 | 
				
			||||||
#define OMP_STATE_COUNT 32
 | 
					#define OMP_STATE_COUNT 32
 | 
				
			||||||
#define MAX_SM 56
 | 
					#define MAX_SM 56
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue