Revert "Add mean_anyway to hpc config"
My bad, wrong repo, so sorry.
This reverts commit 0b9350f3da.
This commit is contained in:
parent
2a6701444a
commit
b09a5e5cb3
|
@ -16,7 +16,7 @@
|
||||||
#include "Utils.h"
|
#include "Utils.h"
|
||||||
|
|
||||||
#pragma omp begin declare target device_type(nohost)
|
#pragma omp begin declare target device_type(nohost)
|
||||||
extern const uint16_t __oclc_ABI_version;
|
|
||||||
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
||||||
|
|
||||||
using namespace _OMP;
|
using namespace _OMP;
|
||||||
|
|
|
@ -11,7 +11,6 @@
|
||||||
// identifier) and contains more up to date values for the enum checked here.
|
// identifier) and contains more up to date values for the enum checked here.
|
||||||
// rtl.cpp uses the system elf.h.
|
// rtl.cpp uses the system elf.h.
|
||||||
#include "llvm/BinaryFormat/ELF.h"
|
#include "llvm/BinaryFormat/ELF.h"
|
||||||
using namespace llvm::ELF;
|
|
||||||
|
|
||||||
const char *get_elf_mach_gfx_name(uint32_t EFlags) {
|
const char *get_elf_mach_gfx_name(uint32_t EFlags) {
|
||||||
using namespace llvm::ELF;
|
using namespace llvm::ELF;
|
||||||
|
@ -79,8 +78,3 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags) {
|
||||||
return "--unknown gfx";
|
return "--unknown gfx";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint16_t implicitArgsSize(uint16_t Version) {
|
|
||||||
return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
|
|
||||||
: IMPLICITARGS::COV5_SIZE;
|
|
||||||
}
|
|
||||||
|
|
|
@ -12,49 +12,4 @@
|
||||||
|
|
||||||
const char *get_elf_mach_gfx_name(uint32_t EFlags);
|
const char *get_elf_mach_gfx_name(uint32_t EFlags);
|
||||||
|
|
||||||
enum IMPLICITARGS : uint16_t {
|
|
||||||
COV4_SIZE = 56,
|
|
||||||
COV4_HOSTCALL_PTR_OFFSET = 24,
|
|
||||||
HOSTCALL_PTR_SIZE = 8,
|
|
||||||
|
|
||||||
COV5_SIZE = 256,
|
|
||||||
|
|
||||||
COV5_BLOCK_COUNT_X_OFFSET = 0,
|
|
||||||
COV5_BLOCK_COUNT_X_SIZE = 4,
|
|
||||||
|
|
||||||
COV5_BLOCK_COUNT_Y_OFFSET = 4,
|
|
||||||
COV5_BLOCK_COUNT_Y_SIZE = 4,
|
|
||||||
|
|
||||||
COV5_BLOCK_COUNT_Z_OFFSET = 8,
|
|
||||||
COV5_BLOCK_COUNT_Z_SIZE = 4,
|
|
||||||
|
|
||||||
COV5_GROUP_SIZE_X_OFFSET = 12,
|
|
||||||
COV5_GROUP_SIZE_X_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_GROUP_SIZE_Y_OFFSET = 14,
|
|
||||||
COV5_GROUP_SIZE_Y_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_GROUP_SIZE_Z_OFFSET = 16,
|
|
||||||
COV5_GROUP_SIZE_Z_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_REMAINDER_X_OFFSET = 18,
|
|
||||||
COV5_REMAINDER_X_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_REMAINDER_Y_OFFSET = 20,
|
|
||||||
COV5_REMAINDER_Y_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_REMAINDER_Z_OFFSET = 22,
|
|
||||||
COV5_REMAINDER_Z_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_GRID_DIMS_OFFSET = 64,
|
|
||||||
COV5_GRID_DIMS_SIZE = 2,
|
|
||||||
|
|
||||||
COV5_HOSTCALL_PTR_OFFSET = 80,
|
|
||||||
|
|
||||||
COV5_HEAPV1_PTR_OFFSET = 96,
|
|
||||||
COV5_HEAPV1_PTR_SIZE = 8
|
|
||||||
};
|
|
||||||
|
|
||||||
const uint16_t implicitArgsSize(uint16_t Version);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -33,6 +33,17 @@
|
||||||
|
|
||||||
#define MAX_NUM_KERNELS (1024 * 16)
|
#define MAX_NUM_KERNELS (1024 * 16)
|
||||||
|
|
||||||
|
typedef struct impl_implicit_args_s {
|
||||||
|
uint64_t offset_x;
|
||||||
|
uint64_t offset_y;
|
||||||
|
uint64_t offset_z;
|
||||||
|
uint64_t hostcall_ptr;
|
||||||
|
uint64_t unused0;
|
||||||
|
uint64_t unused1;
|
||||||
|
uint64_t unused2;
|
||||||
|
} impl_implicit_args_t;
|
||||||
|
static_assert(sizeof(impl_implicit_args_t) == 56, "");
|
||||||
|
|
||||||
// ---------------------- Kernel Start -------------
|
// ---------------------- Kernel Start -------------
|
||||||
typedef struct atl_kernel_info_s {
|
typedef struct atl_kernel_info_s {
|
||||||
uint64_t kernel_object;
|
uint64_t kernel_object;
|
||||||
|
|
|
@ -67,17 +67,6 @@ public:
|
||||||
HiddenMultiGridSyncArg,
|
HiddenMultiGridSyncArg,
|
||||||
HiddenHostcallBuffer,
|
HiddenHostcallBuffer,
|
||||||
HiddenHeapV1,
|
HiddenHeapV1,
|
||||||
HiddenBlockCountX,
|
|
||||||
HiddenBlockCountY,
|
|
||||||
HiddenBlockCountZ,
|
|
||||||
HiddenGroupSizeX,
|
|
||||||
HiddenGroupSizeY,
|
|
||||||
HiddenGroupSizeZ,
|
|
||||||
HiddenRemainderX,
|
|
||||||
HiddenRemainderY,
|
|
||||||
HiddenRemainderZ,
|
|
||||||
HiddenGridDims,
|
|
||||||
HiddenQueuePtr,
|
|
||||||
Unknown
|
Unknown
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -113,19 +102,7 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
|
||||||
{"hidden_multigrid_sync_arg",
|
{"hidden_multigrid_sync_arg",
|
||||||
KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
|
KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
|
||||||
{"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
|
{"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
|
||||||
{"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1},
|
{"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
|
||||||
{"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX},
|
|
||||||
{"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY},
|
|
||||||
{"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ},
|
|
||||||
{"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX},
|
|
||||||
{"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY},
|
|
||||||
{"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ},
|
|
||||||
{"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX},
|
|
||||||
{"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY},
|
|
||||||
{"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ},
|
|
||||||
{"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims},
|
|
||||||
{"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr},
|
|
||||||
};
|
|
||||||
|
|
||||||
namespace core {
|
namespace core {
|
||||||
|
|
||||||
|
@ -187,17 +164,6 @@ static bool isImplicit(KernelArgMD::ValueKind value_kind) {
|
||||||
case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
|
case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
|
||||||
case KernelArgMD::ValueKind::HiddenHostcallBuffer:
|
case KernelArgMD::ValueKind::HiddenHostcallBuffer:
|
||||||
case KernelArgMD::ValueKind::HiddenHeapV1:
|
case KernelArgMD::ValueKind::HiddenHeapV1:
|
||||||
case KernelArgMD::ValueKind::HiddenBlockCountX:
|
|
||||||
case KernelArgMD::ValueKind::HiddenBlockCountY:
|
|
||||||
case KernelArgMD::ValueKind::HiddenBlockCountZ:
|
|
||||||
case KernelArgMD::ValueKind::HiddenGroupSizeX:
|
|
||||||
case KernelArgMD::ValueKind::HiddenGroupSizeY:
|
|
||||||
case KernelArgMD::ValueKind::HiddenGroupSizeZ:
|
|
||||||
case KernelArgMD::ValueKind::HiddenRemainderX:
|
|
||||||
case KernelArgMD::ValueKind::HiddenRemainderY:
|
|
||||||
case KernelArgMD::ValueKind::HiddenRemainderZ:
|
|
||||||
case KernelArgMD::ValueKind::HiddenGridDims:
|
|
||||||
case KernelArgMD::ValueKind::HiddenQueuePtr:
|
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
@ -507,7 +473,8 @@ static hsa_status_t get_code_object_custom_metadata(
|
||||||
size_t new_offset = lcArg.offset_;
|
size_t new_offset = lcArg.offset_;
|
||||||
size_t padding = new_offset - offset;
|
size_t padding = new_offset - offset;
|
||||||
offset = new_offset;
|
offset = new_offset;
|
||||||
|
DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
|
||||||
|
lcArg.offset_);
|
||||||
offset += lcArg.size_;
|
offset += lcArg.size_;
|
||||||
|
|
||||||
// check if the arg is a hidden/implicit arg
|
// check if the arg is a hidden/implicit arg
|
||||||
|
@ -515,13 +482,9 @@ static hsa_status_t get_code_object_custom_metadata(
|
||||||
if (!isImplicit(lcArg.valueKind_)) {
|
if (!isImplicit(lcArg.valueKind_)) {
|
||||||
info.explicit_argument_count++;
|
info.explicit_argument_count++;
|
||||||
kernel_explicit_args_size += lcArg.size_;
|
kernel_explicit_args_size += lcArg.size_;
|
||||||
DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
|
|
||||||
lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
|
|
||||||
} else {
|
} else {
|
||||||
info.implicit_argument_count++;
|
info.implicit_argument_count++;
|
||||||
hasHiddenArgs = true;
|
hasHiddenArgs = true;
|
||||||
DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
|
|
||||||
lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
|
|
||||||
}
|
}
|
||||||
kernel_explicit_args_size += padding;
|
kernel_explicit_args_size += padding;
|
||||||
}
|
}
|
||||||
|
@ -529,7 +492,7 @@ static hsa_status_t get_code_object_custom_metadata(
|
||||||
|
|
||||||
// TODO: Probably don't want this arithmetic
|
// TODO: Probably don't want this arithmetic
|
||||||
info.kernel_segment_size =
|
info.kernel_segment_size =
|
||||||
(!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
|
(hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
|
||||||
DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
|
DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
|
||||||
kernel_segment_size, info.kernel_segment_size);
|
kernel_segment_size, info.kernel_segment_size);
|
||||||
|
|
||||||
|
|
|
@ -124,10 +124,9 @@ public:
|
||||||
uint32_t KernargSegmentSize;
|
uint32_t KernargSegmentSize;
|
||||||
void *KernargRegion = nullptr;
|
void *KernargRegion = nullptr;
|
||||||
std::queue<int> FreeKernargSegments;
|
std::queue<int> FreeKernargSegments;
|
||||||
uint16_t CodeObjectVersion;
|
|
||||||
|
|
||||||
uint32_t kernargSizeIncludingImplicit() {
|
uint32_t kernargSizeIncludingImplicit() {
|
||||||
return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
|
return KernargSegmentSize + sizeof(impl_implicit_args_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
~KernelArgPool() {
|
~KernelArgPool() {
|
||||||
|
@ -144,10 +143,8 @@ public:
|
||||||
KernelArgPool(const KernelArgPool &) = delete;
|
KernelArgPool(const KernelArgPool &) = delete;
|
||||||
KernelArgPool(KernelArgPool &&) = delete;
|
KernelArgPool(KernelArgPool &&) = delete;
|
||||||
|
|
||||||
KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
|
KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
|
||||||
uint16_t CodeObjectVersion)
|
: KernargSegmentSize(KernargSegmentSize) {
|
||||||
: KernargSegmentSize(KernargSegmentSize),
|
|
||||||
CodeObjectVersion(CodeObjectVersion) {
|
|
||||||
|
|
||||||
// impl uses one pool per kernel for all gpus, with a fixed upper size
|
// impl uses one pool per kernel for all gpus, with a fixed upper size
|
||||||
// preserving that exact scheme here, including the queue<int>
|
// preserving that exact scheme here, including the queue<int>
|
||||||
|
@ -231,16 +228,16 @@ struct KernelTy {
|
||||||
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
|
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
|
||||||
int32_t DeviceId, void *CallStackAddr, const char *Name,
|
int32_t DeviceId, void *CallStackAddr, const char *Name,
|
||||||
uint32_t KernargSegmentSize,
|
uint32_t KernargSegmentSize,
|
||||||
hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
|
hsa_amd_memory_pool_t &KernArgMemoryPool)
|
||||||
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
|
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
|
||||||
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
|
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
|
||||||
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
|
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
|
||||||
|
|
||||||
std::string N(Name);
|
std::string N(Name);
|
||||||
if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
|
if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
|
||||||
KernelArgPoolMap.insert(std::make_pair(
|
KernelArgPoolMap.insert(
|
||||||
N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
|
std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
|
||||||
KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
|
KernargSegmentSize, KernArgMemoryPool))));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -477,7 +474,6 @@ public:
|
||||||
std::vector<int> WarpSize;
|
std::vector<int> WarpSize;
|
||||||
std::vector<std::string> GPUName;
|
std::vector<std::string> GPUName;
|
||||||
std::vector<std::string> TargetID;
|
std::vector<std::string> TargetID;
|
||||||
uint16_t CodeObjectVersion;
|
|
||||||
|
|
||||||
// OpenMP properties
|
// OpenMP properties
|
||||||
std::vector<int> NumTeams;
|
std::vector<int> NumTeams;
|
||||||
|
@ -491,7 +487,6 @@ public:
|
||||||
|
|
||||||
// Resource pools
|
// Resource pools
|
||||||
SignalPoolT FreeSignalPool;
|
SignalPoolT FreeSignalPool;
|
||||||
std::vector<void *> PreallocatedDeviceHeap;
|
|
||||||
|
|
||||||
bool HostcallRequired = false;
|
bool HostcallRequired = false;
|
||||||
|
|
||||||
|
@ -866,6 +861,7 @@ public:
|
||||||
"Unexpected device id!");
|
"Unexpected device id!");
|
||||||
FuncGblEntries[DeviceId].emplace_back();
|
FuncGblEntries[DeviceId].emplace_back();
|
||||||
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
|
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
|
||||||
|
// KernelArgPoolMap.clear();
|
||||||
E.Entries.clear();
|
E.Entries.clear();
|
||||||
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
|
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
|
||||||
}
|
}
|
||||||
|
@ -1036,7 +1032,6 @@ public:
|
||||||
SymbolInfoTable.resize(NumberOfDevices);
|
SymbolInfoTable.resize(NumberOfDevices);
|
||||||
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
|
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
|
||||||
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
|
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
|
||||||
PreallocatedDeviceHeap.resize(NumberOfDevices);
|
|
||||||
|
|
||||||
Err = setupDevicePools(HSAAgents);
|
Err = setupDevicePools(HSAAgents);
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
if (Err != HSA_STATUS_SUCCESS) {
|
||||||
|
@ -1366,27 +1361,6 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
|
||||||
return PacketId;
|
return PacketId;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
|
|
||||||
char *ImageBegin = (char *)Image->ImageStart;
|
|
||||||
size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
|
|
||||||
|
|
||||||
StringRef Buffer = StringRef(ImageBegin, ImageSize);
|
|
||||||
auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
|
|
||||||
/*InitContent=*/false);
|
|
||||||
if (!ElfOrErr) {
|
|
||||||
REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
|
|
||||||
auto Header = ELFObj->getELFFile().getHeader();
|
|
||||||
uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
|
|
||||||
DP("ELFABIVERSION Version: %u\n", Version);
|
|
||||||
return Version;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
||||||
ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
|
ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
|
||||||
int32_t ThreadLimit, uint64_t LoopTripcount) {
|
int32_t ThreadLimit, uint64_t LoopTripcount) {
|
||||||
|
@ -1464,7 +1438,6 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
||||||
}
|
}
|
||||||
uint64_t PacketId = acquireAvailablePacketId(Queue);
|
uint64_t PacketId = acquireAvailablePacketId(Queue);
|
||||||
|
|
||||||
uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
|
|
||||||
const uint32_t Mask = Queue->size - 1; // size is a power of 2
|
const uint32_t Mask = Queue->size - 1; // size is a power of 2
|
||||||
hsa_kernel_dispatch_packet_t *Packet =
|
hsa_kernel_dispatch_packet_t *Packet =
|
||||||
(hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
|
(hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
|
||||||
|
@ -2187,40 +2160,6 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
|
||||||
return Res;
|
return Res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void preAllocateHeapMemoryForCov5() {
|
|
||||||
void *DevPtr;
|
|
||||||
for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
|
|
||||||
DevPtr = nullptr;
|
|
||||||
size_t PreAllocSize = 131072; // 128KB per device
|
|
||||||
|
|
||||||
hsa_amd_memory_pool_t MemoryPool =
|
|
||||||
DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
|
|
||||||
hsa_status_t Err =
|
|
||||||
hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
|
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
|
||||||
DP("Error allocating preallocated heap device memory: %s\n",
|
|
||||||
get_error_string(Err));
|
|
||||||
}
|
|
||||||
|
|
||||||
Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
|
|
||||||
DevPtr);
|
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
|
||||||
DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
|
|
||||||
get_error_string(Err));
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t Rounded =
|
|
||||||
sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
|
|
||||||
Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
|
|
||||||
if (Err != HSA_STATUS_SUCCESS) {
|
|
||||||
DP("Error zero-initializing preallocated heap device memory:%s\n",
|
|
||||||
get_error_string(Err));
|
|
||||||
}
|
|
||||||
|
|
||||||
DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
__tgt_device_image *Image) {
|
__tgt_device_image *Image) {
|
||||||
// This function loads the device image onto gpu[DeviceId] and does other
|
// This function loads the device image onto gpu[DeviceId] and does other
|
||||||
|
@ -2255,12 +2194,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
if (!elfMachineIdIsAmdgcn(Image))
|
if (!elfMachineIdIsAmdgcn(Image))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
|
|
||||||
if (DeviceInfo().CodeObjectVersion >=
|
|
||||||
llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
|
|
||||||
preAllocateHeapMemoryForCov5();
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
auto Env =
|
auto Env =
|
||||||
DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
|
DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
|
||||||
|
@ -2584,8 +2517,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
|
||||||
|
|
||||||
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
|
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
|
||||||
CallStackAddr, E->name, KernargSegmentSize,
|
CallStackAddr, E->name, KernargSegmentSize,
|
||||||
DeviceInfo().KernArgPool,
|
DeviceInfo().KernArgPool));
|
||||||
DeviceInfo().CodeObjectVersion));
|
|
||||||
__tgt_offload_entry Entry = *E;
|
__tgt_offload_entry Entry = *E;
|
||||||
Entry.addr = (void *)&KernelsList.back();
|
Entry.addr = (void *)&KernelsList.back();
|
||||||
DeviceInfo().addOffloadEntry(DeviceId, Entry);
|
DeviceInfo().addOffloadEntry(DeviceId, Entry);
|
||||||
|
|
|
@ -100,7 +100,7 @@ function(libomp_get_ldflags ldflags)
|
||||||
libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
|
libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
|
||||||
IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
|
IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
|
||||||
libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
|
libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
|
||||||
libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
|
libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
|
||||||
libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
|
libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
|
||||||
libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
|
libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
|
||||||
libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
|
libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
|
||||||
|
|
|
@ -131,7 +131,7 @@ if(WIN32)
|
||||||
elseif(NOT APPLE)
|
elseif(NOT APPLE)
|
||||||
libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
|
libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
|
||||||
libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
|
libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
|
||||||
libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
|
libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
|
||||||
libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
|
libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
|
||||||
libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
|
libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
|
||||||
libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
|
libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
|
||||||
|
|
Loading…
Reference in New Issue