From aa79524c51fb014f8df17069d31d7c44b9ea6cb8 Mon Sep 17 00:00:00 2001
From: uvos
Date: Tue, 29 Jul 2025 20:23:04 +0200
Subject: [PATCH] HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only AMD targets (#14945)

---
 ggml/src/ggml-cuda/common.cuh        | 50 ++++++++++++++--------------
 ggml/src/ggml-cuda/fattn-common.cuh  |  4 +--
 ggml/src/ggml-cuda/fattn-mma-f16.cuh |  8 ++---
 ggml/src/ggml-cuda/fattn-tile-f16.cu |  4 +--
 ggml/src/ggml-cuda/fattn-tile-f32.cu |  4 +--
 ggml/src/ggml-cuda/fattn-wmma-f16.cu |  8 ++---
 ggml/src/ggml-cuda/ggml-cuda.cu      | 10 +++---
 ggml/src/ggml-cuda/mma.cuh           |  4 +--
 ggml/src/ggml-cuda/mmq.cuh           | 26 +++++++--------
 ggml/src/ggml-cuda/vendors/hip.h     | 14 ++++--
 10 files changed, 67 insertions(+), 65 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 19fcc5982..44daafbf7 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 #define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
     do { \
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
     do { \
         GGML_UNUSED(nbytes); \
     } while (0)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
 
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
+#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
+#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 #define AMD_MFMA_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
 #if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
 
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
     if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
     } else {
         return false;
     }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
+#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
@@ -312,25 +312,25 @@ static bool cp_async_available(const int cc) {
 }
 
 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
     return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
 }
 
 [[noreturn]]
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
@@ -367,7 +367,7 @@ struct ggml_cuda_unroll<1> {
 
 template
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -375,7 +375,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, offset, width);
     }
     return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 }
 
 template
@@ -444,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
 
 #else
     NO_DEVICE_CODE;
@@ -476,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 
 template
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
@@ -485,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
     GGML_UNUSED(x);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -497,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3) || defined(RDNA4)
@@ -523,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
-#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP)
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
 #else
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index 95e704e39..0cc74f284 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
 }
 
 template // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_combine_results(
     const float * __restrict__ VKQ_parts,
     const float2 * __restrict__ VKQ_meta,
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 83cf872f6..8e847d361 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
         constexpr bool use_logit_softcap = false;
         fattn_kernel = flash_attn_ext_f16;
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     } else {
         constexpr bool use_logit_softcap = true;
         fattn_kernel = flash_attn_ext_f16;
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
     }
 
     launch_fattn
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
index 7661c21ef..678288c13 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F16 64
 
 template // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
index 11778bb96..bc283d9a7 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -5,9 +5,9 @@
 #define FATTN_KQ_STRIDE_TILE_F32 32
 
 template // D == head size
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 __launch_bounds__(nwarps*WARP_SIZE, 2)
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 static __global__ void flash_attn_tile_ext_f32(
         const char * __restrict__ Q,
         const char * __restrict__ K,
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index c9b083bed..f4393ec57 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -7,7 +7,7 @@
 #include "fattn-wmma-f16.cuh"
 
 #ifdef FP16_MMA_AVAILABLE
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
 #include
 #ifdef GGML_USE_MUSA
 namespace wmma = mtmusa::wmma;
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include
 namespace wmma = rocwmma;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 #endif // FP16_MMA_AVAILABLE
 
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         return;
     }
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !defined(GGML_USE_HIP)
     if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
         constexpr int cols_per_block = 8;
         switch (Q->ne[0]) {
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
         }
         return;
     }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !defined(GGML_USE_HIP)
 
     if (Q->ne[1] <= 32) {
         constexpr int cols_per_block = 16;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 819c2be78..517927946 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -128,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 }
 
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 static int ggml_cuda_parse_id(char devName[]) {
     // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
     // these values are not stable so this is susceptible to breakage
@@ -175,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
     archNum += archMinor;
     return archNum;
 }
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 
 static ggml_cuda_device_info ggml_cuda_init() {
-#ifdef __HIP_PLATFORM_AMD__
+#if defined(GGML_USE_HIP)
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
     {
@@ -251,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].nsm = prop.multiProcessorCount;
         info.devices[id].smpb = prop.sharedMemPerBlock;
         info.devices[id].warp_size = prop.warpSize;
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
 
         info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
@@ -281,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
                       id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
"yes" : "no"); -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) } for (int id = 0; id < info.device_count; ++id) { diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index d6817d804..a86365c6a 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -68,7 +68,7 @@ namespace ggml_cuda_mma { static constexpr int I = I_; static constexpr int J = J_; -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) static constexpr int ne = I * J / 64; T x[ne] = {0}; @@ -132,7 +132,7 @@ namespace ggml_cuda_mma { static_assert(I == -1 && J == -1, "template specialization not implemented"); } } -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) }; template diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 36e84be15..d8650dd70 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -104,9 +104,9 @@ static constexpr __device__ int get_mmq_x_max_device() { return 128; #else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) return 64; -#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#else // defined(GGML_USE_HIP) #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #ifdef GGML_CUDA_FORCE_MMQ @@ -118,7 +118,7 @@ static constexpr __device__ int get_mmq_x_max_device() { return 64; #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) #endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE) } @@ -128,7 +128,7 @@ static int get_mmq_y_host(const int cc) { } static constexpr __device__ int get_mmq_y_device() { -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) #if defined(RDNA1) return 64; #else @@ -140,7 +140,7 @@ static constexpr __device__ int get_mmq_y_device() { #else return 64; #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA -#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) } // Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes. @@ -250,7 +250,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/) } #endif // AMD_MFMA_AVAILABLE -#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) static int mmq_get_nwarps_host(const int cc) { return amd_mfma_available(cc) ? 
 }
@@ -258,10 +258,10 @@ static int mmq_get_nwarps_host(const int cc) {
 static int mmq_get_nwarps_host(const int /*cc*/) {
     return 8;
 }
-#endif // (GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // (GGML_USE_HIP)
 
 static constexpr __device__ int mmq_get_nwarps_device() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(AMD_MFMA_AVAILABLE)
     return 8;
 #else
@@ -269,7 +269,7 @@ static constexpr __device__ int mmq_get_nwarps_device() {
 #endif // AMD_MFMA_AVAILABLE
 #else
     return 8;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 }
 
 // ------------------------------------------------------------
@@ -3047,7 +3047,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 
 template
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP)
 #if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
@@ -3057,7 +3057,7 @@ template
 #else
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
 #endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP)
 static __global__ void mul_mat_q(
         const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds,
         float * __restrict__ dst, float * __restrict__ tmp_fixup,
@@ -3097,7 +3097,7 @@ static __global__ void mul_mat_q(
     __syncthreads();
 
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
     {
         const int wt = blockIdx.z / nchannels_y;
         const int zt = blockIdx.z - wt*nchannels_y;
@@ -3151,7 +3151,7 @@ static __global__ void mul_mat_q(
                 tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
             return;
         }
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
+#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
 
     const int64_t blocks_per_ne00 = ncols_x / qk;
     constexpr int blocks_per_iter = MMQ_ITER_K / qk;
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 56e59a058..8b172e60f 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -5,10 +5,8 @@
 #include
 #include
 #include
-#ifdef __HIP_PLATFORM_AMD__
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
 
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@@ -139,7 +137,7 @@
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
+#if HIP_VERSION >= 70000000
 #define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@@ -151,7 +149,11 @@
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
 #define cublasComputeType_t hipblasDatatype_t
 #define cudaDataType_t hipblasDatatype_t
-#endif
+#endif // HIP_VERSION >= 70000000
+
+#if !defined(__HIP_PLATFORM_AMD__)
+#error "The HIP backend supports only AMD targets"
+#endif // !defined(__HIP_PLATFORM_AMD__)
 
 #define __CUDA_ARCH__ 1300
 
@@ -249,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
     return c;
 }
 
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#if HIP_VERSION < 50600000
 // __shfl_xor() for half2 was added in ROCm 5.6
 static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
     typedef union half2_b32 {
@@ -261,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
     tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
     return tmp.val;
 }
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
+#endif // HIP_VERSION < 50600000
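
Illustrative sketch (not part of the commit): with the new #error guard in vendors/hip.h rejecting non-AMD HIP platforms at compile time, device code only needs to key off GGML_USE_HIP plus the per-architecture macros. The helper below simply mirrors ggml_cuda_get_physical_warp_size() from common.cuh in this patch; the name ggml_example_physical_warp_size is hypothetical.

// Sketch only: assumes a HIP build defines GGML_USE_HIP and that vendors/hip.h
// has already raised an #error for any non-AMD HIP platform.
static constexpr __device__ int ggml_example_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64; // GCN/Vega/CDNA-class GPUs execute 64-wide wavefronts
#else
    return 32; // NVIDIA warps and RDNA wave32
#endif
}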