HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only AMD targets (#14945)

commit aa79524c51
parent b77d11179d
Author: uvos
Date:   2025-07-29 20:23:04 +02:00
Committed by: GitHub (GPG key ID B5690EEEBB952194; no known key found for this signature in database)

10 changed files with 67 additions and 65 deletions
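
The change is mechanical throughout: guards of the form defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) collapse to defined(GGML_USE_HIP), and the HIP vendor header gains a hard #error for non-AMD targets, so the stronger condition is enforced once at compile time instead of being repeated at every use site. A minimal standalone sketch of the resulting contract (not code from the commit; GGML_AMD_PATH is a hypothetical marker macro, and the #error here is conditioned on GGML_USE_HIP so the file also compiles outside a HIP build):

    // Mirrors the new vendor-header guard: a HIP build that is not targeting
    // AMD is rejected outright, so GGML_USE_HIP alone implies an AMD target.
    #if defined(GGML_USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
    #error "The HIP backend supports only AMD targets"
    #endif

    #if defined(GGML_USE_HIP) // previously: defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    #define GGML_AMD_PATH 1   // hypothetical marker, for illustration only
    #else
    #define GGML_AMD_PATH 0
    #endif

    #include <cstdio>

    int main() {
        std::printf("AMD code path enabled: %d\n", GGML_AMD_PATH);
        return 0;
    }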

==== file 1 of 10 ====

@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 # define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
 do { \
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
 do { \
 GGML_UNUSED(nbytes); \
 } while (0)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 return false;
 #else
 if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
 } else {
 return false;
 }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
 }
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 return 64;
 #else
 return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 [[noreturn]]
 static __device__ void no_device_code(
 const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
 file_name, line, function_name, arch);
 GGML_UNUSED(arch_list);
 #else
 printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
 file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 __trap();
 GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,7 +367,7 @@ struct ggml_cuda_unroll<1> {
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -375,7 +375,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
 x += __shfl_xor_sync(0xffffffff, x, offset, width);
 }
 return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 template<int width = WARP_SIZE>
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
 return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
 return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
 #else
 NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
 for (int offset = width/2; offset > 0; offset >>= 1) {
 x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
 GGML_UNUSED(x);
 NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 #if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
 c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
 return c;
-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 return __dp4a(a, b, c);
@@ -533,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
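
An aside on the warp-size helper in this file: GFX8/GFX9 AMD GPUs execute 64-lane wavefronts, while NVIDIA hardware and the other AMD generations take the 32-lane branch above, so butterfly reductions like warp_reduce_sum must iterate over the physical width or half the lanes would be left out of the result. A standalone CUDA sketch of the effect (the kernel and launch code are illustrative, not from this file; building it for HIP additionally assumes the __shfl_xor_sync wrapper that ggml's vendor header provides):

    #include <cstdio>

    // Same selection logic as the header above: wave64 on GFX8/GFX9, else 32.
    static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
    #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
        return 64;
    #else
        return 32;
    #endif
    }

    template <int width>
    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = width/2; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, width);
        }
        return x;
    }

    __global__ void sum_one_warp(float * out) {
        constexpr int warp_size = ggml_cuda_get_physical_warp_size();
        const float s = warp_reduce_sum<warp_size>(1.0f); // every lane contributes 1.0f
        if (threadIdx.x == 0) {
            *out = s; // equals the physical warp size
        }
    }

    int main() {
        float * out;
        cudaMallocManaged(&out, sizeof(float));
        sum_one_warp<<<1, 64>>>(out);
        cudaDeviceSynchronize();
        std::printf("warp sum: %g\n", *out); // 32 on NVIDIA, 64 on GFX8/GFX9
        cudaFree(out);
        return 0;
    }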

==== file 2 of 10 ====

@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
 }
 template<int D> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
 const float * __restrict__ VKQ_parts,
 const float2 * __restrict__ VKQ_meta,

==== file 3 of 10 ====

@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
 constexpr bool use_logit_softcap = false;
 fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
 if (!shared_memory_limit_raised[id]) {
 CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
 shared_memory_limit_raised[id] = true;
 }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 } else {
 constexpr bool use_logit_softcap = true;
 fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
 if (!shared_memory_limit_raised[id]) {
 CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
 shared_memory_limit_raised[id] = true;
 }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 }
 launch_fattn<DV, ncols1, ncols2>

==== file 4 of 10 ====

@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64
 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f16(
 const char * __restrict__ Q,
 const char * __restrict__ K,

==== file 5 of 10 ====

@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32
 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f32(
 const char * __restrict__ Q,
 const char * __restrict__ K,

==== file 6 of 10 ====

@@ -7,7 +7,7 @@
 #include "fattn-wmma-f16.cuh"
 #ifdef FP16_MMA_AVAILABLE
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 #include <mma.h>
 #ifdef GGML_USE_MUSA
 namespace wmma = mtmusa::wmma;
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
 namespace wmma = rocwmma;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 #endif // FP16_MMA_AVAILABLE
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
 return;
 }
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
 constexpr int cols_per_block = 8;
 switch (Q->ne[0]) {
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
 }
 return;
 }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 if (Q->ne[1] <= 32) {
 constexpr int cols_per_block = 16;

==== file 7 of 10 ====

@@ -128,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 return err;
 }
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static int ggml_cuda_parse_id(char devName[]) {
 // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
 // these values are not stable so this is susceptible to breakage
@@ -175,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
 archNum += archMinor;
 return archNum;
 }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 static ggml_cuda_device_info ggml_cuda_init() {
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(GGML_USE_HIP)
 // Workaround for a rocBLAS bug when using multiple graphics cards:
 // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
 {
@@ -251,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 info.devices[id].nsm = prop.multiProcessorCount;
 info.devices[id].smpb = prop.sharedMemPerBlock;
 info.devices[id].warp_size = prop.warpSize;
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 info.devices[id].smpbo = prop.sharedMemPerBlock;
 info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
@@ -281,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 info.devices[id].cc = 100*prop.major + 10*prop.minor;
 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
 id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 for (int id = 0; id < info.device_count; ++id) {

==== file 8 of 10 ====

@@ -68,7 +68,7 @@ namespace ggml_cuda_mma {
 static constexpr int I = I_;
 static constexpr int J = J_;
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static constexpr int ne = I * J / 64;
 T x[ne] = {0};
@@ -132,7 +132,7 @@ namespace ggml_cuda_mma {
 static_assert(I == -1 && J == -1, "template specialization not implemented");
 }
 }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 };
 template <int I_, int J_>
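
The I * J / 64 above is the per-lane element count: a HIP wavefront has 64 lanes, so each lane of an I x J tile holds I*J/64 values. The non-HIP branch of this struct is not shown in this excerpt; the 32-lane figure below is an assumption by analogy. A tiny sketch with an illustrative name:

    #include <cstdio>

    // Elements each lane owns when an I x J tile is spread across `lanes` lanes.
    template <int I, int J, int lanes>
    struct tile_elems_per_lane {
        static constexpr int value = I * J / lanes;
    };

    int main() {
        std::printf("16x16 tile: %d per lane on wave64, %d per lane on 32-lane warps\n",
                    tile_elems_per_lane<16, 16, 64>::value,  // 4
                    tile_elems_per_lane<16, 16, 32>::value); // 8
        return 0;
    }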

==== file 9 of 10 ====

@@ -104,9 +104,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
 return 128;
 #else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 return 64;
-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
 #ifdef GGML_CUDA_FORCE_MMQ
@@ -118,7 +118,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
 return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
 }
@@ -128,7 +128,7 @@ static int get_mmq_y_host(const int cc) {
 }
 static constexpr __device__ int get_mmq_y_device() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA1)
 return 64;
 #else
@@ -140,7 +140,7 @@ static constexpr __device__ int get_mmq_y_device() {
 #else
 return 64;
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 // Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
@@ -250,7 +250,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
 }
 #endif // AMD_MFMA_AVAILABLE
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static int mmq_get_nwarps_host(const int cc) {
 return amd_mfma_available(cc) ? 8 : 4;
 }
@@ -258,10 +258,10 @@ static int mmq_get_nwarps_host(const int cc) {
 static int mmq_get_nwarps_host(const int /*cc*/) {
 return 8;
 }
-#endif // (GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 static constexpr __device__ int mmq_get_nwarps_device() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
 return 8;
 #else
@@ -269,7 +269,7 @@ static constexpr __device__ int mmq_get_nwarps_device() {
 #endif // AMD_MFMA_AVAILABLE
 #else
 return 8;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 // ------------------------------------------------------------
@@ -3047,7 +3047,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 template <ggml_type type, int mmq_x, bool need_check>
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
@@ -3057,7 +3057,7 @@ template <ggml_type type, int mmq_x, bool need_check>
 #else
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 static __global__ void mul_mat_q(
 const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
 const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -3097,7 +3097,7 @@ static __global__ void mul_mat_q(
 __syncthreads();
 // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 {
 const int wt = blockIdx.z / nchannels_y;
 const int zt = blockIdx.z - wt*nchannels_y;
@@ -3151,7 +3151,7 @@ static __global__ void mul_mat_q(
 tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
 return;
 }
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 const int64_t blocks_per_ne00 = ncols_x / qk;
 constexpr int blocks_per_iter = MMQ_ITER_K / qk;

==== file 10 of 10 ====

@@ -5,10 +5,8 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bfloat16.h>
-#ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -139,7 +137,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
+#if HIP_VERSION >= 70000000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -151,7 +149,11 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif
+#endif // HIP_VERSION >= 70000000
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)
 #define __CUDA_ARCH__ 1300
@@ -249,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
 return c;
 }
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#if HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
 typedef union half2_b32 {
@@ -261,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
 tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
 return tmp.val;
 }
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#endif // HIP_VERSION < 50600000
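
A reading note for the version gates in this last file: HIP encodes HIP_VERSION as major*10000000 + minor*100000 + patch, so 50600000 is ROCm 5.6 (where __shfl_xor for half2 first shipped, per the comment above) and 70000000 is ROCm 7.0 (where the HIPBLAS_COMPUTE_* types are used). A quick standalone check (the helper name is illustrative):

    #include <cstdio>

    // HIP_VERSION = HIP_VERSION_MAJOR*10000000 + HIP_VERSION_MINOR*100000 + HIP_VERSION_PATCH
    static constexpr int hip_version(int major, int minor, int patch) {
        return major * 10000000 + minor * 100000 + patch;
    }

    int main() {
        std::printf("%d\n", hip_version(5, 6, 0)); // 50600000: half2 __shfl_xor fallback no longer needed
        std::printf("%d\n", hip_version(7, 0, 0)); // 70000000: HIPBLAS_COMPUTE_* types available
        return 0;
    }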