4311 lines
196 KiB
C
4311 lines
196 KiB
C
#define GGML_COMMON_IMPL_C
|
|
#include "ggml-common.h"
|
|
#include "ggml-quants.h"
|
|
#include "ggml-impl.h"
|
|
#include "ggml-cpu.h"
|
|
|
|
#include "../../quants.h"
|
|
#include "../../ggml-cpu-impl.h"
|
|
|
|
#include <math.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <stdlib.h> // for qsort
|
|
#include <stdio.h> // for GGML_ASSERT
|
|
|
|
#define GROUP_MAX_EPS 1e-15f
|
|
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
|
|
#define GROUP_MAX_EPS_IQ2_S 1e-8f
|
|
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
|
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
|
|
|
#define UNUSED GGML_UNUSED
|
|
|
|
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
|
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
|
|
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
|
// multiply int8_t, add results pairwise twice
|
|
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
|
// Get absolute values of x vectors
|
|
const __m128i ax = _mm_sign_epi8(x, x);
|
|
// Sign the values of the y vectors
|
|
const __m128i sy = _mm_sign_epi8(y, x);
|
|
// Perform multiplication and create 16-bit values
|
|
const __m128i dot = _mm_maddubs_epi16(ax, sy);
|
|
const __m128i ones = _mm_set1_epi16(1);
|
|
return _mm_madd_epi16(ones, dot);
|
|
}
|
|
|
|
#if __AVX__ || __AVX2__ || __AVX512F__
|
|
// horizontally add 8 floats
|
|
static inline float hsum_float_8(const __m256 x) {
|
|
__m128 res = _mm256_extractf128_ps(x, 1);
|
|
res = _mm_add_ps(res, _mm256_castps256_ps128(x));
|
|
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
|
|
res = _mm_add_ss(res, _mm_movehdup_ps(res));
|
|
return _mm_cvtss_f32(res);
|
|
}
|
|
|
|
// horizontally add 8 int32_t
|
|
static inline int hsum_i32_8(const __m256i a) {
|
|
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
|
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
|
|
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
|
|
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
|
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
|
}
|
|
|
|
// Horizontal sum: returns the sum of the 4 int32 lanes of a.
static inline int hsum_i32_4(const __m128i a) {
    // 4 -> 2 lanes: add the upper 64 bits onto the lower 64 bits
    const __m128i pair = _mm_add_epi32(_mm_unpackhi_epi64(a, a), a);
    // 2 -> 1 lane: swap neighbouring 32-bit lanes and add
    const __m128i swapped = _mm_shuffle_epi32(pair, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_cvtsi128_si32(_mm_add_epi32(pair, swapped));
}
|
|
|
|
#if defined(__AVX2__) || defined(__AVX512F__)
|
|
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
|
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
|
uint32_t x32;
|
|
memcpy(&x32, x, sizeof(uint32_t));
|
|
const __m256i shuf_mask = _mm256_set_epi64x(
|
|
0x0303030303030303, 0x0202020202020202,
|
|
0x0101010101010101, 0x0000000000000000);
|
|
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
|
|
const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
|
bytes = _mm256_or_si256(bytes, bit_mask);
|
|
return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
|
|
}
|
|
|
|
// Unpack 32 4-bit fields into 32 bytes
|
|
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
|
|
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
|
|
{
|
|
const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
|
|
const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
|
|
const __m256i lowMask = _mm256_set1_epi8( 0xF );
|
|
return _mm256_and_si256(lowMask, bytes);
|
|
}
|
|
|
|
// add int16_t pairwise and return as float vector
|
|
static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
|
const __m256i ones = _mm256_set1_epi16(1);
|
|
const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
|
|
return _mm256_cvtepi32_ps(summed_pairs);
|
|
}
|
|
|
|
// Multiply unsigned bytes (ax) by signed bytes (sy), add groups of four adjacent
// products into int32 lanes and return the 8 sums as floats.
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    // single-instruction u8*s8 dot product accumulated into a zero vector
    const __m256i sums32 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ax, sy);
#elif defined(__AVXVNNI__)
    const __m256i sums32 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), ax, sy);
#else
    // bytes -> 16-bit pair sums, then 16-bit -> 32-bit pair sums
    const __m256i prod16 = _mm256_maddubs_epi16(ax, sy);
    const __m256i sums32 = _mm256_madd_epi16(_mm256_set1_epi16(1), prod16);
#endif
    return _mm256_cvtepi32_ps(sums32);
}
|
|
|
|
// multiply int8_t, add results pairwise twice and return as float vector
|
|
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
|
#if __AVXVNNIINT8__
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
|
|
return _mm256_cvtepi32_ps(summed_pairs);
|
|
#else
|
|
// Get absolute values of x vectors
|
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
|
// Sign the values of the y vectors
|
|
const __m256i sy = _mm256_sign_epi8(y, x);
|
|
return mul_sum_us8_pairs_float(ax, sy);
|
|
#endif
|
|
}
|
|
|
|
static inline __m128i packNibbles( __m256i bytes )
|
|
{
|
|
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
|
#if __AVX512F__
|
|
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
|
|
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
|
|
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
|
|
#else
|
|
const __m256i lowByte = _mm256_set1_epi16( 0xFF );
|
|
__m256i high = _mm256_andnot_si256( lowByte, bytes );
|
|
__m256i low = _mm256_and_si256( lowByte, bytes );
|
|
high = _mm256_srli_epi16( high, 4 );
|
|
bytes = _mm256_or_si256( low, high );
|
|
|
|
// Compress uint16_t lanes into bytes
|
|
__m128i r0 = _mm256_castsi256_si128( bytes );
|
|
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
|
|
return _mm_packus_epi16( r0, r1 );
|
|
#endif
|
|
}
|
|
#elif defined(__AVX__)
|
|
// Pack 2 x 16 bytes, each expected to hold a value in [ 0 .. 15 ], into 16 bytes
// of two nibbles each. bytes1 produces output bytes 0..7, bytes2 produces 8..15.
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
    const __m128i low_byte_mask = _mm_set1_epi16( 0xFF );

    // In every 16-bit lane, slide the upper byte down by one nibble and merge it
    // with the lower byte: 0000_abcd_0000_efgh -> 0000_0000_abcd_efgh
    const __m128i folded1 = _mm_or_si128(
        _mm_and_si128( low_byte_mask, bytes1 ),
        _mm_srli_epi16( _mm_andnot_si128( low_byte_mask, bytes1 ), 4 ) );
    const __m128i folded2 = _mm_or_si128(
        _mm_and_si128( low_byte_mask, bytes2 ),
        _mm_srli_epi16( _mm_andnot_si128( low_byte_mask, bytes2 ), 4 ) );

    // narrow the 16-bit lanes of both registers to bytes
    return _mm_packus_epi16( folded1, folded2 );
}
|
|
|
|
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
|
const __m128i ax = _mm_sign_epi8(x, x);
|
|
const __m128i sy = _mm_sign_epi8(y, x);
|
|
return _mm_maddubs_epi16(ax, sy);
|
|
}
|
|
|
|
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
|
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
|
uint32_t x32;
|
|
memcpy(&x32, x, sizeof(uint32_t));
|
|
const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
|
const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
|
|
__m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
|
|
__m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
|
|
const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
|
bytesl = _mm_or_si128(bytesl, bit_mask);
|
|
bytesh = _mm_or_si128(bytesh, bit_mask);
|
|
bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
|
|
bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
|
|
return MM256_SET_M128I(bytesh, bytesl);
|
|
}
|
|
|
|
// Unpack 32 4-bit fields into 32 bytes
|
|
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
|
|
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
|
|
{
|
|
// Load 16 bytes from memory
|
|
__m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
|
|
__m128i tmph = _mm_srli_epi16(tmpl, 4);
|
|
const __m128i lowMask = _mm_set1_epi8(0xF);
|
|
tmpl = _mm_and_si128(lowMask, tmpl);
|
|
tmph = _mm_and_si128(lowMask, tmph);
|
|
return MM256_SET_M128I(tmph, tmpl);
|
|
}
|
|
|
|
// add int16_t pairwise and return as float vector
|
|
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
|
|
const __m128i ones = _mm_set1_epi16(1);
|
|
const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
|
|
const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
|
|
const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
|
|
return _mm256_cvtepi32_ps(summed_pairs);
|
|
}
|
|
|
|
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
|
const __m128i axl = _mm256_castsi256_si128(ax);
|
|
const __m128i axh = _mm256_extractf128_si256(ax, 1);
|
|
const __m128i syl = _mm256_castsi256_si128(sy);
|
|
const __m128i syh = _mm256_extractf128_si256(sy, 1);
|
|
// Perform multiplication and create 16-bit values
|
|
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
|
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
|
return sum_i16_pairs_float(doth, dotl);
|
|
}
|
|
|
|
// multiply int8_t, add results pairwise twice and return as float vector
|
|
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
|
const __m128i xl = _mm256_castsi256_si128(x);
|
|
const __m128i xh = _mm256_extractf128_si256(x, 1);
|
|
const __m128i yl = _mm256_castsi256_si128(y);
|
|
const __m128i yh = _mm256_extractf128_si256(y, 1);
|
|
// Get absolute values of x vectors
|
|
const __m128i axl = _mm_sign_epi8(xl, xl);
|
|
const __m128i axh = _mm_sign_epi8(xh, xh);
|
|
// Sign the values of the y vectors
|
|
const __m128i syl = _mm_sign_epi8(yl, xl);
|
|
const __m128i syh = _mm_sign_epi8(yh, xh);
|
|
// Perform multiplication and create 16-bit values
|
|
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
|
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
|
return sum_i16_pairs_float(doth, dotl);
|
|
}
|
|
|
|
// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
|
|
static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
|
|
const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
|
|
const __m128i mone = _mm_set1_epi16(1);
|
|
|
|
const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
|
|
const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
|
|
const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
|
|
const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
|
|
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
|
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
|
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
|
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
|
const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
|
|
const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
|
|
return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
|
|
}
|
|
|
|
// quad fp16 delta calculation
|
|
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
|
|
// GGML_FP16_TO_FP32 is faster than Intel F16C
|
|
return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
|
|
_mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
|
|
}
|
|
#endif
|
|
#elif defined(__SSSE3__)
|
|
// horizontally add 4x4 floats
|
|
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
|
|
__m128 res_0 =_mm_hadd_ps(a, b);
|
|
__m128 res_1 =_mm_hadd_ps(c, d);
|
|
__m128 res =_mm_hadd_ps(res_0, res_1);
|
|
res =_mm_hadd_ps(res, res);
|
|
res =_mm_hadd_ps(res, res);
|
|
|
|
return _mm_cvtss_f32(res);
|
|
}
|
|
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
|
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
|
|
|
// Quantize a row of floats into q8_0 blocks.
//   x  - source values; k floats are read
//   vy - destination, written as k / QK8_0 consecutive block_q8_0
//   k  - number of values; must be a multiple of QK8_0 (checked by assert only)
// Per block of 32 values the scale d = max|x| / 127 is stored as fp16 and every
// value is stored as round(x * 127 / max|x|) in int8 (all zeros if max|x| == 0).
// Builds without AVX/AVX2 delegate to quantize_row_q8_0_ref.
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    // k is 64-bit, so keep the block count 64-bit as well instead of truncating to int
    const int64_t nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__AVX2__) || defined(__AVX__)
    for (int64_t i = 0; i < nb; i++) {
        // Load elements into 4 AVX vectors
        __m256 v0 = _mm256_loadu_ps( x );
        __m256 v1 = _mm256_loadu_ps( x + 8 );
        __m256 v2 = _mm256_loadu_ps( x + 16 );
        __m256 v3 = _mm256_loadu_ps( x + 24 );
        x += 32;

        // Compute max(abs(e)) for the block
        // (-0.0f has only the sign bit set, so andnot clears the sign of every lane)
        const __m256 signBit = _mm256_set1_ps( -0.0f );
        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );

        // horizontal max: 8 -> 4 -> 2 -> 1 lanes
        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
        const float maxScalar = _mm_cvtss_f32( max4 );

        // Quantize these floats
        const float d = maxScalar / 127.f;
        y[i].d = GGML_FP32_TO_FP16(d);
        // an all-zero block gets a zero multiplier instead of a division by zero
        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
        const __m256 mul = _mm256_set1_ps( id );

        // Apply the multiplier
        v0 = _mm256_mul_ps( v0, mul );
        v1 = _mm256_mul_ps( v1, mul );
        v2 = _mm256_mul_ps( v2, mul );
        v3 = _mm256_mul_ps( v3, mul );

        // Round to nearest integer
        // (_MM_FROUND_TO_NEAREST_INT is the rounding immediate of _mm256_round_ps;
        //  _MM_ROUND_NEAREST is an MXCSR constant that merely shares the value 0)
        v0 = _mm256_round_ps( v0, _MM_FROUND_TO_NEAREST_INT );
        v1 = _mm256_round_ps( v1, _MM_FROUND_TO_NEAREST_INT );
        v2 = _mm256_round_ps( v2, _MM_FROUND_TO_NEAREST_INT );
        v3 = _mm256_round_ps( v3, _MM_FROUND_TO_NEAREST_INT );

        // Convert floats to integers
        __m256i i0 = _mm256_cvtps_epi32( v0 );
        __m256i i1 = _mm256_cvtps_epi32( v1 );
        __m256i i2 = _mm256_cvtps_epi32( v2 );
        __m256i i3 = _mm256_cvtps_epi32( v3 );

#if defined(__AVX2__)
        // Convert int32 to int16
        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
                                            // Convert int16 to int8
        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31

        // We got our precious signed bytes, but the order is now wrong
        // These AVX2 pack instructions process 16-byte pieces independently
        // The following instruction is fixing the order
        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
        i0 = _mm256_permutevar8x32_epi32( i0, perm );

        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
        // Since we don't have in AVX some necessary functions,
        // we split the registers in half and call AVX2 analogs from SSE
        __m128i ni0 = _mm256_castsi256_si128( i0 );
        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
        __m128i ni2 = _mm256_castsi256_si128( i1 );
        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
        __m128i ni4 = _mm256_castsi256_si128( i2 );
        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
        __m128i ni6 = _mm256_castsi256_si128( i3 );
        __m128i ni7 = _mm256_extractf128_si256( i3, 1);

        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 );
        ni2 = _mm_packs_epi32( ni2, ni3 );
        ni4 = _mm_packs_epi32( ni4, ni5 );
        ni6 = _mm_packs_epi32( ni6, ni7 );
        // Convert int16 to int8
        ni0 = _mm_packs_epi16( ni0, ni2 );
        ni4 = _mm_packs_epi16( ni4, ni6 );

        // 128-bit packs keep the natural element order, so no permutation is needed
        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}
|
|
|
|
// Quantize a row of floats into q8_1 blocks.
//   x  - source values; k floats are read
//   vy - destination, written as k / QK8_1 consecutive block_q8_1
//   k  - number of values; must be a multiple of QK8_1 (checked by assert only)
// Per block of 32 values the scale d = max|x| / 127 is stored as fp16, every value
// is stored as round(x * 127 / max|x|) in int8 (all zeros if max|x| == 0), and
// s = d * (sum of the 32 quants) is stored as fp16.
// Builds without AVX/AVX2 delegate to quantize_row_q8_1_ref.
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    // the vector path below hard-codes 32 values per block, same as quantize_row_q8_0
    assert(QK8_1 == 32);
    assert(k % QK8_1 == 0);
    // k is 64-bit, so keep the block count 64-bit as well instead of truncating to int
    const int64_t nb = k / QK8_1;

    block_q8_1 * GGML_RESTRICT y = vy;
#if defined(__AVX2__) || defined(__AVX__)
    for (int64_t i = 0; i < nb; i++) {
        // Load elements into 4 AVX vectors
        __m256 v0 = _mm256_loadu_ps( x );
        __m256 v1 = _mm256_loadu_ps( x + 8 );
        __m256 v2 = _mm256_loadu_ps( x + 16 );
        __m256 v3 = _mm256_loadu_ps( x + 24 );
        x += 32;

        // Compute max(abs(e)) for the block
        // (-0.0f has only the sign bit set, so andnot clears the sign of every lane)
        const __m256 signBit = _mm256_set1_ps( -0.0f );
        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );

        // horizontal max: 8 -> 4 -> 2 -> 1 lanes
        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
        const float max_scalar = _mm_cvtss_f32( max4 );

        // Quantize these floats
        const float d = max_scalar / 127.f;
        y[i].d = GGML_FP32_TO_FP16(d);
        // an all-zero block gets a zero multiplier instead of a division by zero
        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
        const __m256 mul = _mm256_set1_ps( id );

        // Apply the multiplier
        v0 = _mm256_mul_ps( v0, mul );
        v1 = _mm256_mul_ps( v1, mul );
        v2 = _mm256_mul_ps( v2, mul );
        v3 = _mm256_mul_ps( v3, mul );

        // Round to nearest integer
        // (_MM_FROUND_TO_NEAREST_INT is the rounding immediate of _mm256_round_ps;
        //  _MM_ROUND_NEAREST is an MXCSR constant that merely shares the value 0)
        v0 = _mm256_round_ps( v0, _MM_FROUND_TO_NEAREST_INT );
        v1 = _mm256_round_ps( v1, _MM_FROUND_TO_NEAREST_INT );
        v2 = _mm256_round_ps( v2, _MM_FROUND_TO_NEAREST_INT );
        v3 = _mm256_round_ps( v3, _MM_FROUND_TO_NEAREST_INT );

        // Convert floats to integers
        __m256i i0 = _mm256_cvtps_epi32( v0 );
        __m256i i1 = _mm256_cvtps_epi32( v1 );
        __m256i i2 = _mm256_cvtps_epi32( v2 );
        __m256i i3 = _mm256_cvtps_epi32( v3 );

#if defined(__AVX2__)
        // Compute the sum of the quants and set y[i].s
        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));

        // Convert int32 to int16
        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
                                            // Convert int16 to int8
        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31

        // We got our precious signed bytes, but the order is now wrong
        // These AVX2 pack instructions process 16-byte pieces independently
        // The following instruction is fixing the order
        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
        i0 = _mm256_permutevar8x32_epi32( i0, perm );

        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
#else
        // Since we don't have in AVX some necessary functions,
        // we split the registers in half and call AVX2 analogs from SSE
        __m128i ni0 = _mm256_castsi256_si128( i0 );
        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
        __m128i ni2 = _mm256_castsi256_si128( i1 );
        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
        __m128i ni4 = _mm256_castsi256_si128( i2 );
        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
        __m128i ni6 = _mm256_castsi256_si128( i3 );
        __m128i ni7 = _mm256_extractf128_si256( i3, 1);

        // Compute the sum of the quants and set y[i].s
        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));

        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 );
        ni2 = _mm_packs_epi32( ni2, ni3 );
        ni4 = _mm_packs_epi32( ni4, ni5 );
        ni6 = _mm_packs_epi32( ni6, ni7 );
        // Convert int16 to int8
        ni0 = _mm_packs_epi16( ni0, ni2 );
        ni4 = _mm_packs_epi16( ni4, ni6 );

        // 128-bit packs keep the natural element order, so no permutation is needed
        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_ref(x, y, k);
#endif
}
|
|
|
|
// placeholder implementation for Apple targets
|
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
|
quantize_row_q8_K_ref(x, y, k);
|
|
}
|
|
|
|
//===================================== Dot products =================================
|
|
|
|
//
|
|
// Helper functions
|
|
//
|
|
|
|
#if __AVX__ || __AVX2__ || __AVX512F__
|
|
|
|
// shuffles to pick the required scales in dot products
|
|
static inline __m256i get_scale_shuffle_q3k(int i) {
|
|
static const uint8_t k_shuffle[128] = {
|
|
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
|
|
4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
|
|
8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
|
|
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
|
|
};
|
|
return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
|
|
}
|
|
static inline __m256i get_scale_shuffle_k4(int i) {
|
|
static const uint8_t k_shuffle[256] = {
|
|
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
|
|
2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
|
|
4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
|
|
6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
|
|
8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
|
|
10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
|
|
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
|
|
14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
|
|
};
|
|
return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
|
|
}
|
|
static inline __m128i get_scale_shuffle(int i) {
|
|
static const uint8_t k_shuffle[128] = {
|
|
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
|
|
6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
|
|
8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
|
|
10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
|
|
12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
|
|
14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
|
|
};
|
|
return _mm_loadu_si128((const __m128i*)k_shuffle + i);
|
|
}
|
|
#endif
|
|
|
|
// Dot product of a q4_0 row with a q8_0 row.
//   n   - number of elements; must be a multiple of QK8_0 (checked by assert only)
//   s   - receives the single float result
//   vx  - n / QK8_0 consecutive block_q4_0
//   vy  - n / QK8_0 consecutive block_q8_0
//   bs, bx, by - unused here
//   nrc - must be 1 (checked by assert only)
// Each block contributes d_x * d_y * sum((q4 - 8) * q8). One of the AVX2 / AVX /
// SSSE3 loops handles as many blocks as it can; the scalar loop at the end
// finishes whatever is left (everything, when no SIMD path is compiled in).
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__AVX2__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (; ib < nb; ++ib) {
        /* Compute combined scale for the block */
        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );

        __m256i qx = bytes_from_nibbles_32(x[ib].qs);

        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m256i off = _mm256_set1_epi8( 8 );
        qx = _mm256_sub_epi8( qx, off );

        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);

        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

        /* Multiply q with scale and accumulate */
        acc = _mm256_fmadd_ps( d, q, acc );
    }

    sumf = hsum_float_8(acc);
#elif defined(__AVX__)
    __m256 accum = _mm256_setzero_ps();
    // two blocks per iteration; an odd trailing block is left to the scalar loop
    for (; ib + 1 < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);

        // low / high nibbles, shifted from [ 0 .. 15 ] to [ -8 .. +7 ]
        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));

        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
        const __m256 p =  sum_i16_pairs_float(p_2, p_1);

        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
    }

    sumf = hsum_float_8(accum);
#elif defined(__SSSE3__)
    // set constants
    const __m128i lowMask = _mm_set1_epi8(0xF);
    const __m128i off = _mm_set1_epi8(8);

    // Initialize accumulator with zeros
    __m128 acc_0 = _mm_setzero_ps();
    __m128 acc_1 = _mm_setzero_ps();
    __m128 acc_2 = _mm_setzero_ps();
    __m128 acc_3 = _mm_setzero_ps();

    // two blocks per iteration; an odd trailing block is left to the scalar loop
    for (; ib + 1 < nb; ib += 2) {
        // Prefetch the next block of each row. The address is formed by indexing the
        // block array, not by adding sizeof(block) to a block pointer: the latter is
        // scaled by the element size again and runs far past the array. ib + 1 < nb
        // holds here, so &x[ib + 1] / &y[ib + 1] are valid elements.
        _mm_prefetch((const char *)&x[ib + 1], _MM_HINT_T0);
        _mm_prefetch((const char *)&y[ib + 1], _MM_HINT_T0);

        // Compute combined scale for the block 0 and 1
        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );

        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);

        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
        bx_0 = _mm_sub_epi8(bx_0, off);
        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);

        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
        bx_1 = _mm_sub_epi8(bx_1, off);
        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

        // Prefetch the block after the pair handled in this iteration; &x[ib + 2] /
        // &y[ib + 2] are at most one past the end of the arrays and never dereferenced.
        _mm_prefetch((const char *)&x[ib + 2], _MM_HINT_T0);
        _mm_prefetch((const char *)&y[ib + 2], _MM_HINT_T0);

        // Compute combined scale for the block 2 and 3
        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );

        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);

        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
        bx_2 = _mm_sub_epi8(bx_2, off);
        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);

        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
        bx_3 = _mm_sub_epi8(bx_3, off);
        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);

        // Convert int32_t to float
        __m128 p0 = _mm_cvtepi32_ps(i32_0);
        __m128 p1 = _mm_cvtepi32_ps(i32_1);
        __m128 p2 = _mm_cvtepi32_ps(i32_2);
        __m128 p3 = _mm_cvtepi32_ps(i32_3);

        // Apply the scale
        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );

        // Accumulate
        acc_0 = _mm_add_ps(p0_d, acc_0);
        acc_1 = _mm_add_ps(p1_d, acc_1);
        acc_2 = _mm_add_ps(p2_d, acc_2);
        acc_3 = _mm_add_ps(p3_d, acc_3);
    }

    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);

#endif
    // scalar tail: blocks not consumed by a SIMD loop above
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            // low nibble pairs with y.qs[j], high nibble with y.qs[j + qk/2]
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >>   4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}
|
|
|
|
// Dot product of a q4_1 row with a q8_1 row.
//   n   - number of elements; must be a multiple of QK8_1 (checked by assert only)
//   s   - receives the single float result
//   vx  - n / QK8_1 consecutive block_q4_1
//   vy  - n / QK8_1 consecutive block_q8_1
//   bs, bx, by - unused here
//   nrc - must be 1 (checked by assert only)
// Each block contributes d_x * d_y * sum(q4 * q8) + m_x * s_y.
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT q4 = vx;
    const block_q8_1 * GGML_RESTRICT q8 = vy;

    int   ib     = 0;
    float result = 0;

#if defined(__AVX2__) || defined(__AVX__)
    // vector accumulator for the d_x * d_y * sum(q4 * q8) terms
    __m256 vsum = _mm256_setzero_ps();

    // scalar accumulator for the m_x * s_y terms
    float offsets = 0;

    // this loop consumes every block, so the scalar loop below does nothing afterwards
    for (; ib < nb; ++ib) {
        const float dx = GGML_FP16_TO_FP32(q4[ib].d);
        const float dy = GGML_FP16_TO_FP32(q8[ib].d);

        offsets += GGML_FP16_TO_FP32(q4[ib].m) * GGML_FP16_TO_FP32(q8[ib].s);

        // combined scale, broadcast to all lanes
        const __m256 scale = _mm256_mul_ps( _mm256_set1_ps( dx ), _mm256_set1_ps( dy ) );

        // 16 packed bytes -> 32 values in [ 0 .. 15 ]; these stay unsigned for q4_1
        const __m256i qx = bytes_from_nibbles_32(q4[ib].qs);
        const __m256i qy = _mm256_loadu_si256( (const __m256i *)q8[ib].qs );

        const __m256 dots = mul_sum_us8_pairs_float(qx, qy);

        // vsum += scale * dots
#if defined(__AVX2__)
        vsum = _mm256_fmadd_ps( scale, dots, vsum );
#else
        vsum = _mm256_add_ps( _mm256_mul_ps( scale, dots ), vsum );
#endif
    }

    result = hsum_float_8(vsum) + offsets;

#endif
    // scalar path: used for all blocks when no AVX path is compiled in
    for (; ib < nb; ++ib) {
        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            // low nibble pairs with q8.qs[j], high nibble with q8.qs[j + qk/2]
            const int lo = (q4[ib].qs[j] & 0x0F);
            const int hi = (q4[ib].qs[j] >>   4);

            isum += (lo * q8[ib].qs[j]) + (hi * q8[ib].qs[j + qk/2]);
        }

        result += (GGML_FP16_TO_FP32(q4[ib].d)*GGML_FP16_TO_FP32(q8[ib].d))*isum + GGML_FP16_TO_FP32(q4[ib].m)*GGML_FP16_TO_FP32(q8[ib].s);
    }

    *s = result;
}
|
|
|
|
// Dot product of a q5_0 row with a q8_0 row.
//   n   - number of elements; must be a multiple of QK8_0 (checked by assert only)
//   s   - receives the single float result
//   vx  - n / QK8_0 consecutive block_q5_0
//   vy  - n / QK8_0 consecutive block_q8_0
//   bs, bx, by - unused here
//   nrc - must be 1 (checked by assert only)
// A q5_0 value is a 4-bit field from qs plus one extra bit from qh, offset by -16.
// Each block contributes d_x * d_y * sum(q5 * q8).
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    int   ib     = 0;
    float result = 0;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * GGML_RESTRICT q5 = vx;
    const block_q8_0 * GGML_RESTRICT q8 = vy;

#if defined(__AVX2__)
    __m256 vsum = _mm256_setzero_ps();

    // this loop consumes every block
    for (; ib < nb; ++ib) {
        // combined scale of the two blocks, broadcast to all lanes
        const __m256 scale = _mm256_set1_ps(GGML_FP16_TO_FP32(q5[ib].d) * GGML_FP16_TO_FP32(q8[ib].d));

        const __m256i low4 = bytes_from_nibbles_32(q5[ib].qs);
        // 0xFF where the fifth bit is set, 0x00 where it is clear
        const __m256i bit5 = bytes_from_bits_32(q5[ib].qh);
        // where the fifth bit is clear, fill the upper nibble with ones: as a signed
        // byte that is (nibble - 16); where it is set the byte stays at nibble
        const __m256i fill = _mm256_andnot_si256(bit5, _mm256_set1_epi8((char)0xF0));
        const __m256i qx   = _mm256_or_si256(low4, fill);

        const __m256i qy = _mm256_loadu_si256((const __m256i *)q8[ib].qs);

        // vsum += scale * dot
        vsum = _mm256_fmadd_ps(scale, mul_sum_i8_pairs_float(qx, qy), vsum);
    }

    result = hsum_float_8(vsum);
#elif defined(__AVX__)
    __m256 vsum = _mm256_setzero_ps();
    const __m128i upper_nibble = _mm_set1_epi8((char)0xF0);

    // this loop consumes every block
    for (; ib < nb; ++ib) {
        // combined scale of the two blocks, broadcast to all lanes
        const __m256 scale = _mm256_set1_ps(GGML_FP16_TO_FP32(q5[ib].d) * GGML_FP16_TO_FP32(q8[ib].d));

        const __m256i low4 = bytes_from_nibbles_32(q5[ib].qs);
        // 0xFF where the fifth bit is set, 0x00 where it is clear
        const __m256i bit5 = bytes_from_bits_32(q5[ib].qh);

        // no 256-bit integer logic on plain AVX: handle the two halves separately.
        // where the fifth bit is clear, fill the upper nibble with ones (nibble - 16)
        const __m128i fill_lo = _mm_andnot_si128(_mm256_castsi256_si128(bit5), upper_nibble);
        const __m128i fill_hi = _mm_andnot_si128(_mm256_extractf128_si256(bit5, 1), upper_nibble);
        const __m128i qx_lo   = _mm_or_si128(_mm256_castsi256_si128(low4), fill_lo);
        const __m128i qx_hi   = _mm_or_si128(_mm256_extractf128_si256(low4, 1), fill_hi);
        const __m256i qx      = MM256_SET_M128I(qx_hi, qx_lo);

        const __m256i qy = _mm256_loadu_si256((const __m256i *)q8[ib].qs);

        const __m256 dots = mul_sum_i8_pairs_float(qx, qy);

        // vsum += scale * dots
        vsum = _mm256_add_ps(_mm256_mul_ps(scale, dots), vsum);
    }

    result = hsum_float_8(vsum);

#endif
    // scalar path: used for all blocks when no AVX path is compiled in
    for (; ib < nb; ++ib) {
        uint32_t high_bits;
        memcpy(&high_bits, q5[ib].qh, sizeof(high_bits));

        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            // fifth bit of element j (bit j) and of element j + 16 (bit j + 16),
            // each moved to bit position 4
            const uint8_t h0 = (uint8_t)(((high_bits >> j) & 1u) << 4);
            const uint8_t h1 = (uint8_t)((high_bits >> (j + 12)) & 0x10u);

            const int32_t v0 = (int8_t)(((q5[ib].qs[j] & 0x0F) | h0) - 16);
            const int32_t v1 = (int8_t)(((q5[ib].qs[j] >>   4) | h1) - 16);

            isum += (v0 * q8[ib].qs[j]) + (v1 * q8[ib].qs[j + qk/2]);
        }

        result += (GGML_FP16_TO_FP32(q5[ib].d)*GGML_FP16_TO_FP32(q8[ib].d)) * isum;
    }

    *s = result;
}
|
|
|
|
// Dot product of one row of q5_1 blocks (vx) with one row of q8_1 blocks (vy).
//
//   n   - number of elements; must be a multiple of QK8_1 (asserted)
//   s   - receives the single float result
//   nrc - must be 1 (asserted); bs, bx and by are ignored by this implementation
//
// Per block the contribution is d_x * d_y * sum(qx * qy) + m_x * s_y. The vector paths keep the
// m_x * s_y terms in a separate scalar accumulator that is added after the horizontal sum.
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT lhs = vx;
    const block_q8_1 * GGML_RESTRICT rhs = vy;

    int   blk    = 0;
    float result = 0;

#if defined(__AVX2__)
    const __m256i bit4 = _mm256_set1_epi8(0x10);

    __m256 acc       = _mm256_setzero_ps();
    float  offset_sum = 0.0f;

    for (; blk < nb; ++blk) {
        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(lhs[blk].d));

        offset_sum += GGML_FP16_TO_FP32(lhs[blk].m) * GGML_FP16_TO_FP32(rhs[blk].s);

        // low nibbles OR-ed with the fifth bit placed at 0x10
        // NOTE(review): relies on bytes_from_nibbles_32 / bytes_from_bits_32, defined elsewhere in this file.
        const __m256i lo4 = bytes_from_nibbles_32(lhs[blk].qs);
        const __m256i hi  = _mm256_and_si256(bytes_from_bits_32(lhs[blk].qh), bit4);
        const __m256i qx  = _mm256_or_si256(lo4, hi);

        const __m256  dy = _mm256_set1_ps(GGML_FP16_TO_FP32(rhs[blk].d));
        const __m256i qy = _mm256_loadu_si256((const __m256i *)rhs[blk].qs);

        const __m256 dot = mul_sum_us8_pairs_float(qx, qy);

        acc = _mm256_fmadd_ps(dot, _mm256_mul_ps(dx, dy), acc);
    }

    result = hsum_float_8(acc) + offset_sum;
#elif defined(__AVX__)
    const __m128i bit4 = _mm_set1_epi8(0x10);

    __m256 acc        = _mm256_setzero_ps();
    float  offset_sum = 0.0f;

    for (; blk < nb; ++blk) {
        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(lhs[blk].d));

        offset_sum += GGML_FP16_TO_FP32(lhs[blk].m) * GGML_FP16_TO_FP32(rhs[blk].s);

        const __m256i lo4   = bytes_from_nibbles_32(lhs[blk].qs);
        const __m256i hbits = bytes_from_bits_32(lhs[blk].qh);

        // the integer logic is done on the two 128-bit halves separately
        const __m128i hi_lo = _mm_and_si128(_mm256_castsi256_si128(hbits), bit4);
        const __m128i hi_hi = _mm_and_si128(_mm256_extractf128_si256(hbits, 1), bit4);
        const __m128i qx_lo = _mm_or_si128(_mm256_castsi256_si128(lo4), hi_lo);
        const __m128i qx_hi = _mm_or_si128(_mm256_extractf128_si256(lo4, 1), hi_hi);

        const __m256i qx = MM256_SET_M128I(qx_hi, qx_lo);

        const __m256  dy = _mm256_set1_ps(GGML_FP16_TO_FP32(rhs[blk].d));
        const __m256i qy = _mm256_loadu_si256((const __m256i *)rhs[blk].qs);

        const __m256 dot = mul_sum_us8_pairs_float(qx, qy);

        acc = _mm256_add_ps(_mm256_mul_ps(dot, _mm256_mul_ps(dx, dy)), acc);
    }

    result = hsum_float_8(acc) + offset_sum;
#endif

    // scalar path for whatever blocks are left
    for (; blk < nb; ++blk) {
        uint32_t hbits;
        memcpy(&hbits, lhs[blk].qh, sizeof(hbits));

        int isum = 0;

        for (int j = 0; j < qk/2; ++j) {
            // fifth bit of element j (bit j) and of element j + qk/2 (bit j + 16), moved to bit 4
            const uint8_t h0 = ((hbits >> (j +  0)) & 1u) << 4;
            const uint8_t h1 = ((hbits >> (j + 16)) & 1u) << 4;

            // unsigned 5-bit quants
            const int32_t q0 = (lhs[blk].qs[j] & 0xF) | h0;
            const int32_t q1 = (lhs[blk].qs[j] >>  4) | h1;

            isum += q0 * rhs[blk].qs[j];
            isum += q1 * rhs[blk].qs[j + qk/2];
        }

        result += (GGML_FP16_TO_FP32(lhs[blk].d)*GGML_FP16_TO_FP32(rhs[blk].d))*isum + GGML_FP16_TO_FP32(lhs[blk].m)*GGML_FP16_TO_FP32(rhs[blk].s);
    }

    *s = result;
}
|
|
|
|
// Dot product of two rows of q8_0 blocks (vx and vy).
//
//   n   - number of elements; must be a multiple of QK8_0 (asserted)
//   s   - receives the single float result
//   nrc - must be 1 (asserted); bs, bx and by are ignored by this implementation
//
// AVX2 handles one block per iteration. The plain AVX path handles two blocks per iteration,
// so an odd trailing block is left for the scalar loop at the end.
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT lhs = vx;
    const block_q8_0 * GGML_RESTRICT rhs = vy;

    int   blk    = 0;
    float result = 0;

#if defined(__AVX2__)
    __m256 acc = _mm256_setzero_ps();

    for (; blk < nb; ++blk) {
        // product of both block scales, broadcast to all lanes
        const __m256  scale = _mm256_set1_ps(GGML_FP16_TO_FP32(lhs[blk].d) * GGML_FP16_TO_FP32(rhs[blk].d));
        const __m256i qx    = _mm256_loadu_si256((const __m256i *)lhs[blk].qs);
        const __m256i qy    = _mm256_loadu_si256((const __m256i *)rhs[blk].qs);

        // acc += scale * dot(qx, qy)
        acc = _mm256_fmadd_ps(scale, mul_sum_i8_pairs_float(qx, qy), acc);
    }

    result = hsum_float_8(acc);
#elif defined(__AVX__)
    __m256 acc = _mm256_setzero_ps();

    for (; blk + 1 < nb; blk += 2) {
        const __m128i * xa = (const __m128i *)lhs[blk].qs;
        const __m128i * xb = (const __m128i *)lhs[blk + 1].qs;
        const __m128i * ya = (const __m128i *)rhs[blk].qs;
        const __m128i * yb = (const __m128i *)rhs[blk + 1].qs;

        // 2 x 16 bytes from each of the two block pairs
        const __m128i xa_lo = _mm_loadu_si128(xa);
        const __m128i xa_hi = _mm_loadu_si128(xa + 1);
        const __m128i xb_lo = _mm_loadu_si128(xb);
        const __m128i xb_hi = _mm_loadu_si128(xb + 1);
        const __m128i ya_lo = _mm_loadu_si128(ya);
        const __m128i ya_hi = _mm_loadu_si128(ya + 1);
        const __m128i yb_lo = _mm_loadu_si128(yb);
        const __m128i yb_hi = _mm_loadu_si128(yb + 1);

        const __m256 dots   = mul_sum_i8_quad_float(xa_lo, xa_hi, xb_lo, xb_hi, ya_lo, ya_hi, yb_lo, yb_hi);
        const __m256 scales = quad_fp16_delta_float(lhs[blk].d, rhs[blk].d, lhs[blk + 1].d, rhs[blk + 1].d);

        acc = _mm256_add_ps(_mm256_mul_ps(scales, dots), acc);
    }

    result = hsum_float_8(acc);
#endif

    // scalar path for whatever blocks are left
    for (; blk < nb; ++blk) {
        int isum = 0;

        for (int j = 0; j < qk; j++) {
            isum += lhs[blk].qs[j]*rhs[blk].qs[j];
        }

        result += isum*(GGML_FP16_TO_FP32(lhs[blk].d)*GGML_FP16_TO_FP32(rhs[blk].d));
    }

    *s = result;
}
|
|
|
|
// Dot product of one row of tq1_0 (ternary) blocks (vx) with one row of q8_K blocks (vy).
//
//   n   - number of elements; must be a multiple of QK_K (asserted, like the sibling kernels;
//         previously a non-multiple silently dropped the trailing elements)
//   s   - receives the single float result
//   nrc - must be 1 (asserted); bs, bx and by are ignored by this implementation
//
// Each packed byte q carries several ternary digits. Digit l is extracted as
// ((uint8_t)(q * 3^l) * 3) >> 8, which is 0, 1 or 2; the value that enters the dot product is
// that digit minus 1 (see the scalar branch). The AVX2 branch multiplies the unsigned digits and
// subtracts y[i].bsums afterwards.
// NOTE(review): that presumably relies on bsums holding the sums of 16 consecutive y quants - confirm.
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq1_0 * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)
    const __m256i k_zero  = _mm256_setzero_si256();
    const __m256i k_one   = _mm256_set1_epi8(1);
    const __m256i k_three = _mm256_set1_epi8(3);
    // 0xF8 per byte: drops the bits a 16-bit shift by 3 drags in from the neighbouring byte
    const __m256i k_hi5   = _mm256_set1_epi8(-8);

    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
        // 16-bit sums
        __m256i sumi0 = k_zero;
        __m256i sumi1 = k_zero;
        __m256i sumi2 = k_zero;

        // first 32 bytes of 5 elements
        {
            const __m256i raw = _mm256_loadu_si256((const __m256i *) (x[i].qs));

            // 8-bit multiplies by powers of 3, built from shifts, masks and adds
            __m256i t0 = raw;
            __m256i t1 = _mm256_add_epi8(raw, _mm256_add_epi8(raw, raw)); // 1 * 3
            __m256i t2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(raw, 3), k_hi5), raw); // 1 * 9
            __m256i t3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(t1, 3), k_hi5), t1); // 3 * 9
            __m256i t4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(t2, 3), k_hi5), t2); // 9 * 9

            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?

            // Cancel the +1 from avg so that it behaves like a halving add
            t0 = _mm256_subs_epu8(t0, k_one);
            t1 = _mm256_subs_epu8(t1, k_one);
            t2 = _mm256_subs_epu8(t2, k_one);
            t3 = _mm256_subs_epu8(t3, k_one);
            t4 = _mm256_subs_epu8(t4, k_one);
            // Multiply by 3 and get the top 2 bits
            t0 = _mm256_avg_epu8(t0, _mm256_avg_epu8(t0, k_zero));
            t1 = _mm256_avg_epu8(t1, _mm256_avg_epu8(t1, k_zero));
            t2 = _mm256_avg_epu8(t2, _mm256_avg_epu8(t2, k_zero));
            t3 = _mm256_avg_epu8(t3, _mm256_avg_epu8(t3, k_zero));
            t4 = _mm256_avg_epu8(t4, _mm256_avg_epu8(t4, k_zero));
            t0 = _mm256_and_si256(_mm256_srli_epi16(t0, 6), k_three);
            t1 = _mm256_and_si256(_mm256_srli_epi16(t1, 6), k_three);
            t2 = _mm256_and_si256(_mm256_srli_epi16(t2, 6), k_three);
            t3 = _mm256_and_si256(_mm256_srli_epi16(t3, 6), k_three);
            t4 = _mm256_and_si256(_mm256_srli_epi16(t4, 6), k_three);

            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));

            // unsigned digit * signed y, adjacent pairs summed to 16 bits
            t0 = _mm256_maddubs_epi16(t0, qy0);
            t1 = _mm256_maddubs_epi16(t1, qy1);
            t2 = _mm256_maddubs_epi16(t2, qy2);
            t3 = _mm256_maddubs_epi16(t3, qy3);
            t4 = _mm256_maddubs_epi16(t4, qy4);

            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(t0, t1));
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(t2, t3));
            sumi2 = _mm256_add_epi16(sumi2, t4);
        }

        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
        {
            const __m128i k_hi5_128 = _mm_set1_epi8(-8);

            const __m128i raw = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
            uint32_t qh;
            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
            __m256i qh16 = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));

            const __m128i m3  = _mm_add_epi8(raw, _mm_add_epi8(raw, raw)); // 1 * 3
            const __m128i m9  = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(raw, 3), k_hi5_128), raw); // 1 * 9
            const __m128i m27 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(m3, 3), k_hi5_128), m3); // 3 * 9
            const __m128i m81 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(m9, 3), k_hi5_128), m9); // 9 * 9
            __m256i t01 = MM256_SET_M128I(m3, raw);
            __m256i t23 = MM256_SET_M128I(m27, m9);

            // avx2 does not have 8-bit multiplies, so 16-bit it is.
            qh16 = _mm256_mullo_epi16(qh16, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
            qh16 = _mm256_and_si256(qh16, _mm256_set1_epi16(0xFF));
            const __m128i qh8 = _mm_packus_epi16(_mm256_castsi256_si128(qh16), _mm256_extracti128_si256(qh16, 1));

            __m256i t45 = MM256_SET_M128I(qh8, m81);

            // Cancel the +1 from avg so that it behaves like a halving add
            t01 = _mm256_subs_epu8(t01, k_one);
            t23 = _mm256_subs_epu8(t23, k_one);
            t45 = _mm256_subs_epu8(t45, k_one);
            // Multiply by 3 and get the top 2 bits
            t01 = _mm256_avg_epu8(t01, _mm256_avg_epu8(t01, k_zero));
            t23 = _mm256_avg_epu8(t23, _mm256_avg_epu8(t23, k_zero));
            t45 = _mm256_avg_epu8(t45, _mm256_avg_epu8(t45, k_zero));
            t01 = _mm256_and_si256(_mm256_srli_epi16(t01, 6), k_three);
            t23 = _mm256_and_si256(_mm256_srli_epi16(t23, 6), k_three);
            t45 = _mm256_and_si256(_mm256_srli_epi16(t45, 6), k_three);

            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));

            t01 = _mm256_maddubs_epi16(t01, qy01);
            t23 = _mm256_maddubs_epi16(t23, qy23);
            t45 = _mm256_maddubs_epi16(t45, qy45);

            sumi0 = _mm256_add_epi16(sumi0, t01);
            sumi1 = _mm256_add_epi16(sumi1, t23);
            sumi2 = _mm256_add_epi16(sumi2, t45);
        }

        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
        const __m256  d    = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));

        // remove the +1 digit offset, merge the partial sums, then widen pairs to 32 bits
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));

        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
    }

    *s = hsum_float_8(sumf);

#else
    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};

    // number of leading qs bytes that are processed in full groups of 32
    const size_t qs_main = sizeof(x->qs) - sizeof(x->qs) % 32;

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int sum = 0;

        // 32-byte groups, 5 digits per byte
        for (size_t j = 0; j < qs_main; j += 32) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 32; ++m) {
                    // the multiplication deliberately wraps to 8 bits
                    uint8_t  q  = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                }
            }
        }
        // remaining qs bytes in 16-byte groups, 5 digits per byte
        for (size_t j = qs_main; j < sizeof(x->qs); j += 16) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 16; ++m) {
                    uint8_t  q  = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                }
            }
        }

        // qh bytes, 4 digits per byte
        for (size_t l = 0; l < 4; ++l) {
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
                uint8_t  q  = x[i].qh[j] * pow3[l];
                uint16_t xi = ((uint16_t) q * 3) >> 8;
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
            }
        }

        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
    }

    *s = sumf;
#endif
}
|
|
|
|
// Dot product of one row of tq2_0 (2-bit ternary) blocks (vx) with one row of q8_K blocks (vy).
//
//   n   - number of elements; must be a multiple of QK_K (asserted, like the sibling kernels;
//         previously a non-multiple silently dropped the trailing elements)
//   s   - receives the single float result
//   nrc - must be 1 (asserted); bs, bx and by are ignored by this implementation
//
// Every qs byte holds four 2-bit fields; field l of byte j + k pairs with
// y.qs[j*4 + l*32 + k] and contributes (field - 1) * y (see the scalar branch).
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_tq2_0 * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)
    const __m256i k_field = _mm256_set1_epi8(3);

    __m256 sumf = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {
        // 16-bit sums, because 256*127 still fits
        __m256i sumi0 = _mm256_setzero_si256();
        __m256i sumi1 = _mm256_setzero_si256();

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            const __m256i packed = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));

            // the four 2-bit fields of every byte: 0, 1, 2 (should not be 3)
            __m256i f0 = _mm256_and_si256(packed, k_field);
            __m256i f1 = _mm256_and_si256(_mm256_srli_epi16(packed, 2), k_field);
            __m256i f2 = _mm256_and_si256(_mm256_srli_epi16(packed, 4), k_field);
            __m256i f3 = _mm256_and_si256(_mm256_srli_epi16(packed, 6), k_field);

            const int8_t * yq = y[i].qs + j*4;
            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (yq +  0));
            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (yq + 32));
            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (yq + 64));
            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (yq + 96));

            // unsigned field * signed y, adjacent pairs summed to 16 bits
            f0 = _mm256_maddubs_epi16(f0, qy0);
            f1 = _mm256_maddubs_epi16(f1, qy1);
            f2 = _mm256_maddubs_epi16(f2, qy2);
            f3 = _mm256_maddubs_epi16(f3, qy3);

            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(f0, f1));
            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(f2, f3));
        }

        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
        const __m256  d    = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));

        // merge, remove the +1 field offset via bsums, then widen pairs to 32 bits
        // NOTE(review): presumably bsums holds the sums of 16 consecutive y quants - confirm.
        sumi0 = _mm256_add_epi16(sumi0, sumi1);
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));

        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
    }

    *s = hsum_float_8(sumf);

#else
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            for (size_t l = 0; l < 4; ++l) {
                for (size_t k = 0; k < 32; ++k) {
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                }
            }
        }

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        sumf += (float) sumi * d;
    }

    *s = sumf;
#endif
}
|
|
|
|
// Dot product of one row of q2_K blocks (vx) with one row of q8_K blocks (vy).
//
//   n   - number of elements; must be a multiple of QK_K (asserted, like the sibling kernels;
//         previously a non-multiple silently dropped the trailing elements)
//   s   - receives the single float result
//   nrc - must be 1 (asserted); bs, bx and by are ignored by this implementation
//
// Per super-block the result is
//     dall * sum_g (scales[g] & 0xF) * dot(q2 group g, q8 group g)  -  dmin * sum_g (scales[g] >> 4) * bsums[g]
// with dall = y.d * x.d and dmin = y.d * x.dmin (see the scalar branch). The vector branches fold the
// minus sign into dmin and accumulate both terms into the same float accumulator.
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined __AVX2__

    const __m256i mask_2bit = _mm256_set1_epi8(3);
    const __m128i mask_4bit = _mm_set1_epi8(0xF);

    __m256 acc = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {

        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // each scales byte: low nibble = scale, high nibble = min
        const __m128i packed  = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(packed, mask_4bit);
        const __m128i mins8   = _mm_and_si128(_mm_srli_epi16(packed, 4), mask_4bit);

        // min term: acc += dmin * (mins . bsums)
        const __m256i mins16   = _mm256_cvtepi8_epi16(mins8);
        const __m256i mins_dot = _mm256_madd_epi16(mins16, _mm256_loadu_si256((const __m256i*)y[i].bsums));

        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(mins_dot), acc);

        // widen the 16 scales to 16 bits and duplicate each half into both 128-bit lanes,
        // so that the byte shuffle below can pick any of its 8 scales in either lane
        const __m256i scales16  = _mm256_cvtepi8_epi16(scales8);
        const __m128i scales_lo = _mm256_extracti128_si256(scales16, 0);
        const __m128i scales_hi = _mm256_extracti128_si256(scales16, 1);
        const __m256i scales[2] = {MM256_SET_M128I(scales_lo, scales_lo), MM256_SET_M128I(scales_hi, scales_hi)};

        __m256i sumi = _mm256_setzero_si256();

        for (int j = 0; j < QK_K/128; ++j) {

            // 32 bytes of 2-bit quants cover 128 elements
            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)(q2 + 32*j));

            const int8_t * q8j = q8 + 128*j;
            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8j +  0));
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8j + 32));
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)(q8j + 64));
            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)(q8j + 96));

            const __m256i q2_0 = _mm256_and_si256(q2bits, mask_2bit);
            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), mask_2bit);
            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), mask_2bit);
            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), mask_2bit);

            // unsigned q2 * signed q8, adjacent pairs summed to 16 bits
            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);

            // apply the per-group scales while widening to 32 bits
            // NOTE(review): get_scale_shuffle_q3k is defined elsewhere; presumably it returns the byte-shuffle
            // mask that selects the scales belonging to the given 32-element chunk - confirm.
            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);

            p0 = _mm256_add_epi32(p0, p1);
            p2 = _mm256_add_epi32(p2, p3);

            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
        }

        // scale term: acc += d * sumi
        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);

    }

    *s = hsum_float_8(acc);

#elif defined __AVX__

    const __m128i mask_2bit = _mm_set1_epi8(0x3);
    const __m128i mask_4bit = _mm_set1_epi8(0xF);
    // added to the shuffle mask to step from one 16-bit scale to the next
    const __m128i next_scale = _mm_set1_epi8(0x2);

    __m256 acc = _mm256_setzero_ps();

    for (int i = 0; i < nb; ++i) {

        const float dall =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // load mins and scales from block_q2_K.scales[QK_K/16]
        const __m128i packed  = _mm_loadu_si128((const __m128i*)x[i].scales);
        const __m128i scales8 = _mm_and_si128(packed, mask_4bit);
        const __m128i mins8   = _mm_and_si128(_mm_srli_epi16(packed, 4), mask_4bit);
        const __m128i mins_0  = _mm_cvtepi8_epi16(mins8);
        const __m128i mins_1  = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins8, mins8));

        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

        // sumf += -dmin * summs in 32bits*8
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

        const __m128i scales_0 = _mm_cvtepi8_epi16(scales8);
        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales8, scales8));
        const __m128i scales[2] = { scales_0, scales_1 };

        __m128i sumi_0 = _mm_setzero_si128();
        __m128i sumi_1 = _mm_setzero_si128();

        for (int j = 0; j < QK_K/128; ++j) {

            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
            const __m128i * q8v = (const __m128i*)(q8 + 128*j);
            const __m128i q8_0 = _mm_loadu_si128(q8v + 0);
            const __m128i q8_1 = _mm_loadu_si128(q8v + 1);
            const __m128i q8_2 = _mm_loadu_si128(q8v + 2);
            const __m128i q8_3 = _mm_loadu_si128(q8v + 3);
            const __m128i q8_4 = _mm_loadu_si128(q8v + 4);
            const __m128i q8_5 = _mm_loadu_si128(q8v + 5);
            const __m128i q8_6 = _mm_loadu_si128(q8v + 6);
            const __m128i q8_7 = _mm_loadu_si128(q8v + 7);

            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
            // the first 16 bytes feed the even-numbered groups, the next 16 bytes the odd-numbered ones
            const __m128i bits_even = _mm_loadu_si128((const __m128i*)(q2 + 32*j));
            const __m128i bits_odd  = _mm_loadu_si128((const __m128i*)(q2 + 32*j + 16));
            const __m128i q2_0 = _mm_and_si128(bits_even, mask_2bit);
            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(bits_even, 2), mask_2bit);
            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(bits_even, 4), mask_2bit);
            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(bits_even, 6), mask_2bit);
            const __m128i q2_1 = _mm_and_si128(bits_odd, mask_2bit);
            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(bits_odd, 2), mask_2bit);
            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(bits_odd, 4), mask_2bit);
            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(bits_odd, 6), mask_2bit);

            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);

            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
            // the shuffle mask starts by broadcasting 16-bit scale 0 (bytes 0,1) and advances one scale per step
            __m128i pick = _mm_set1_epi16(0x0100);
            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p0);
            pick = _mm_add_epi16(pick, next_scale);
            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p1);
            pick = _mm_add_epi16(pick, next_scale);
            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p2);
            pick = _mm_add_epi16(pick, next_scale);
            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p3);
            pick = _mm_add_epi16(pick, next_scale);
            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p4);
            pick = _mm_add_epi16(pick, next_scale);
            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p5);
            pick = _mm_add_epi16(pick, next_scale);
            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p6);
            pick = _mm_add_epi16(pick, next_scale);
            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], pick), p7);

            p0 = _mm_add_epi32(p0, p1);
            p2 = _mm_add_epi32(p2, p3);
            p4 = _mm_add_epi32(p4, p5);
            p6 = _mm_add_epi32(p6, p7);

            // isum in 32bits*4*2
            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
        }

        // sumf += dall * isum - dmin * summs in 32bits
        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
    }

    *s = hsum_float_8(acc);

#else

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        // min term: high nibble of every scale byte times the matching q8 group sum
        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is   = 0;
        for (int k = 0; k < QK_K/128; ++k) {
            // the same 32 q2 bytes are read four times, once per 2-bit field
            for (int shift = 0; shift < 8; shift += 2) {
                // two groups of 16 elements, each with its own 4-bit scale
                for (int half = 0; half < 2; ++half) {
                    const int scale = sc[is++] & 0xF;
                    int isuml = 0;
                    for (int l = 16*half; l < 16*half + 16; ++l) {
                        isuml += q8[l] * ((q2[l] >> shift) & 3);
                    }
                    isum += scale * isuml;
                }
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}
|
|
|
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
assert(n % QK_K == 0);
|
|
assert(nrc == 1);
|
|
UNUSED(nrc);
|
|
UNUSED(bx);
|
|
UNUSED(by);
|
|
UNUSED(bs);
|
|
|
|
const uint32_t kmask1 = 0x03030303;
|
|
const uint32_t kmask2 = 0x0f0f0f0f;
|
|
|
|
const block_q3_K * GGML_RESTRICT x = vx;
|
|
const block_q8_K * GGML_RESTRICT y = vy;
|
|
|
|
const int nb = n / QK_K;
|
|
|
|
#if defined __AVX2__
|
|
|
|
const __m256i m3 = _mm256_set1_epi8(3);
|
|
const __m256i mone = _mm256_set1_epi8(1);
|
|
const __m128i m32 = _mm_set1_epi8(32);
|
|
|
|
__m256 acc = _mm256_setzero_ps();
|
|
|
|
uint32_t aux[3];
|
|
|
|
for (int i = 0; i < nb; ++i) {
|
|
|
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
// Set up scales
|
|
memcpy(aux, x[i].scales, 12);
|
|
__m128i scales128 = _mm_set_epi32(
|
|
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
|
|
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
|
|
(aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
|
|
(aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
|
|
scales128 = _mm_sub_epi8(scales128, m32);
|
|
const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
|
|
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
|
|
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
|
|
const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
|
|
|
|
// high bit
|
|
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
|
|
|
|
// integer accumulator
|
|
__m256i sumi = _mm256_setzero_si256();
|
|
|
|
int bit = 0;
|
|
int is = 0;
|
|
|
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
// load low 2 bits
|
|
const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
|
|
|
|
// prepare low and high bits
|
|
const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
|
|
const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
|
|
++bit;
|
|
|
|
const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
|
|
const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
|
|
++bit;
|
|
|
|
const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
|
|
const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
|
|
++bit;
|
|
|
|
const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
|
|
const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
|
|
++bit;
|
|
|
|
// load Q8 quants
|
|
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
|
|
// Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
|
|
// and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
|
|
// and 2 if the high bit was set)
|
|
__m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
|
|
__m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
|
|
__m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
|
|
__m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
|
|
|
|
__m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
|
|
__m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
|
|
__m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
|
|
__m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
|
|
|
|
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
|
|
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
|
|
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
|
|
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
|
|
|
|
// multiply with scales
|
|
p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
|
|
p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
|
|
p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
|
|
p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
|
|
|
|
// accumulate
|
|
p16_0 = _mm256_add_epi32(p16_0, p16_1);
|
|
p16_2 = _mm256_add_epi32(p16_2, p16_3);
|
|
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
|
|
|
|
}
|
|
|
|
// multiply with block scale and accumulate
|
|
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
|
|
|
|
}
|
|
|
|
*s = hsum_float_8(acc);
|
|
|
|
#elif defined __AVX__
|
|
|
|
const __m128i m3 = _mm_set1_epi8(3);
|
|
const __m128i mone = _mm_set1_epi8(1);
|
|
const __m128i m32 = _mm_set1_epi8(32);
|
|
const __m128i m2 = _mm_set1_epi8(2);
|
|
|
|
__m256 acc = _mm256_setzero_ps();
|
|
|
|
const uint32_t *aux;
|
|
|
|
for (int i = 0; i < nb; ++i) {
|
|
|
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
// Set up scales
|
|
aux = (const uint32_t *)x[i].scales;
|
|
__m128i scales128 = _mm_set_epi32(
|
|
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
|
|
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
|
|
(aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
|
|
(aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
|
|
scales128 = _mm_sub_epi8(scales128, m32);
|
|
const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
|
|
const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
|
|
const __m128i scales[2] = { scales_0, scales_1 };
|
|
|
|
// high bit *128*2 from block_q3_K.hmask[QK_K/8]
|
|
const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
|
|
const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
|
|
|
|
// integer accumulator
|
|
__m128i sumi_0 = _mm_setzero_si128();
|
|
__m128i sumi_1 = _mm_setzero_si128();
|
|
|
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
// load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
|
|
const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
|
|
const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
|
|
|
|
// prepare low and high bits
|
|
const int bit = j << 2;
|
|
|
|
const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
|
|
const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
|
|
const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
|
|
const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
|
|
|
|
const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
|
|
const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
|
|
const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
|
|
const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
|
|
|
|
const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
|
|
const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
|
|
const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
|
|
const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
|
|
|
|
const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
|
|
const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
|
|
const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
|
|
const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
|
|
|
|
// load Q8 quants from block_q8_K.qs[QK_K]
|
|
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
|
|
// Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
|
|
// and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
|
|
// and 2 if the high bit was set)
|
|
__m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
|
|
__m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
|
|
__m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
|
|
__m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
|
|
__m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
|
|
__m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
|
|
__m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
|
|
__m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
|
|
|
|
__m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
|
|
__m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
|
|
__m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
|
|
__m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
|
|
__m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
|
|
__m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
|
|
__m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
|
|
__m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
|
|
|
|
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
|
|
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
|
|
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
|
|
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
|
|
p16_4 = _mm_sub_epi16(p16_4, q8s_4);
|
|
p16_5 = _mm_sub_epi16(p16_5, q8s_5);
|
|
p16_6 = _mm_sub_epi16(p16_6, q8s_6);
|
|
p16_7 = _mm_sub_epi16(p16_7, q8s_7);
|
|
|
|
// multiply with scales
|
|
__m128i shuffle = _mm_set1_epi16(0x0100);
|
|
p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
|
|
shuffle = _mm_add_epi16(shuffle, m2);
|
|
p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
|
|
|
|
// accumulate
|
|
p16_0 = _mm_add_epi32(p16_0, p16_1);
|
|
p16_2 = _mm_add_epi32(p16_2, p16_3);
|
|
p16_4 = _mm_add_epi32(p16_4, p16_5);
|
|
p16_6 = _mm_add_epi32(p16_6, p16_7);
|
|
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
|
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
|
|
|
|
}
|
|
|
|
// multiply with block scale and accumulate
|
|
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
|
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
|
|
|
|
}
|
|
|
|
*s = hsum_float_8(acc);
|
|
|
|
#else
|
|
// scalar version
|
|
// This function is written like this so the compiler can manage to vectorize most of it
|
|
// Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
|
|
// manually vectorized version above. Every other version I tried would run at least 4 times slower.
|
|
// The ideal situation would be if we could just write the code once, and the compiler would
|
|
// automatically produce the best possible set of machine instructions, instead of us having to manually
|
|
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
|
|
int8_t aux8[QK_K];
|
|
int16_t aux16[8];
|
|
float sums [8];
|
|
int32_t aux32[8];
|
|
memset(sums, 0, 8*sizeof(float));
|
|
|
|
uint32_t auxs[4];
|
|
const int8_t * scales = (const int8_t*)auxs;
|
|
|
|
float sumf = 0;
|
|
for (int i = 0; i < nb; ++i) {
|
|
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
int8_t * GGML_RESTRICT a = aux8;
|
|
uint8_t m = 1;
|
|
for (int j = 0; j < QK_K; j += 128) {
|
|
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
a += 32; m <<= 1;
|
|
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
a += 32; m <<= 1;
|
|
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
a += 32; m <<= 1;
|
|
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
a += 32; m <<= 1;
|
|
q3 += 32;
|
|
}
|
|
a = aux8;
|
|
|
|
memcpy(auxs, x[i].scales, 12);
|
|
uint32_t tmp = auxs[2];
|
|
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
for (int j = 0; j < QK_K/16; ++j) {
|
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
q8 += 8; a += 8;
|
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
q8 += 8; a += 8;
|
|
}
|
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
}
|
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
*s = sumf;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
// Dot product of a row of Q4_K super-blocks (vx) with a row of Q8_K super-blocks (vy).
//
// Parameters:
//   n          - number of elements in the row; must be a multiple of QK_K
//   s          - output; receives the single float result in *s
//   vx         - pointer to n/QK_K block_q4_K entries
//   vy         - pointer to n/QK_K block_q8_K entries
//   bs, bx, by - not used by this implementation
//   nrc        - must be 1 (asserted)
//
// For every super-block i the contribution is
//     d    * sum_j( scale_j * sum(q4 * q8) )  -  dmin * sum_j( min_j * bsums )
// with d = x[i].d * y[i].d and dmin = x[i].dmin * y[i].d. The 12-byte x[i].scales field is
// unpacked into 8 scale bytes (utmp[0..1]) and 8 min bytes (utmp[2..3]).
// Three implementations are selected at compile time: AVX2, AVX, and a scalar fallback.
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of super-blocks in the row
    const int nb = n / QK_K;

    // masks used to unpack the 6-bit scales/mins: low 6 bits, low 4 bits, low 2 bits of every byte
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    // scratch for the unpacked scales (words 0..1) and mins (words 2..3)
    uint32_t utmp[4];

#if defined __AVX2__

    const __m256i m4 = _mm256_set1_epi8(0xF);

    __m256 acc = _mm256_setzero_ps();   // accumulates d * (scaled quant products)
    __m128 acc_m = _mm_setzero_ps();    // accumulates dmin * (mins * bsums); dmin is already negated

    for (int i = 0; i < nb; ++i) {

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        // negated so that the mins term can be folded in with an add
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // widen to 16 bit: low 128-bit lane = scales, high 128-bit lane = mins
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));

        // mins term: add adjacent pairs of bsums so that one sum lines up with each min
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

        // duplicate the 8 scales into both 128-bit lanes for the per-lane byte shuffles below
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        const __m256i scales = MM256_SET_M128I(sc128, sc128);

        __m256i sumi = _mm256_setzero_si256();

        // each iteration consumes 32 bytes of q4 (64 quants) and 64 bytes of q8
        for (int j = 0; j < QK_K/64; ++j) {

            // presumably get_scale_shuffle_k4(k) yields the mask that broadcasts scale k - defined elsewhere
            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));

            // split every byte into its low and high nibble
            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
            const __m256i q4l = _mm256_and_si256(q4bits, m4);
            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);

            // low nibbles pair with the first 32 q8 values, high nibbles with the next 32
            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
            p16l = _mm256_madd_epi16(scale_l, p16l);

            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
            p16h = _mm256_madd_epi16(scale_h, p16h);
            const __m256i sumj = _mm256_add_epi32(p16l, p16h);

            sumi = _mm256_add_epi32(sumi, sumj);
        }

        // multiply with the block scale and accumulate
        __m256 vd = _mm256_set1_ps(d);
        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);

    }

    // horizontal sum of the 4 floats in acc_m
    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));

    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);

#elif defined __AVX__

    const __m128i m4 = _mm_set1_epi8(0xF);
    const __m128i m2 = _mm_set1_epi8(0x2);

    __m256 acc = _mm256_setzero_ps();   // accumulates d * (scaled quant products)
    __m128 acc_m = _mm_setzero_ps();    // accumulates dmin * (mins * bsums); dmin is already negated

    for (int i = 0; i < nb; ++i) {

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        // negated so that the mins term can be folded in with an add
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // widen to 16 bit: scales from the low 8 bytes, mins from the high 8 bytes
        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
        const __m128i scales = _mm_cvtepu8_epi16(utmps);
        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));

        // mins term: add adjacent pairs of bsums so that one sum lines up with each min
        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
        const __m128i prod = _mm_madd_epi16(mins, q8s);
        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);

        // two 128-bit integer accumulators, combined into one 256-bit vector after the loop
        __m128i sumi_0 = _mm_setzero_si128();
        __m128i sumi_1 = _mm_setzero_si128();

        // byte-shuffle mask that broadcasts 16-bit scale 0; adding m2 (2 in every byte) moves to the next scale
        __m128i shuffle = _mm_set1_epi16(0x0100);
        for (int j = 0; j < QK_K/64; ++j) {

            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
            shuffle = _mm_add_epi16(shuffle, m2);
            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
            shuffle = _mm_add_epi16(shuffle, m2);

            // 2 x 16 bytes of q4, each split into low and high nibbles
            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);

            // low nibbles pair with the first 32 q8 values
            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
            p16l = _mm_madd_epi16(scale_l, p16l);
            sumi_0 = _mm_add_epi32(sumi_0, p16l);
            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
            p16l = _mm_madd_epi16(scale_l, p16l);
            sumi_1 = _mm_add_epi32(sumi_1, p16l);

            // high nibbles pair with the next 32 q8 values
            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
            p16h = _mm_madd_epi16(scale_h, p16h);
            sumi_0 = _mm_add_epi32(sumi_0, p16h);
            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
            p16h = _mm_madd_epi16(scale_h, p16h);
            sumi_1 = _mm_add_epi32(sumi_1, p16h);

        }

        // multiply with the block scale and accumulate
        __m256 vd = _mm256_set1_ps(d);
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

    }

    // horizontal sum of the 4 floats in acc_m
    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));

    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);

#else

    // scalar fallback: byte views onto the unpacked scales (utmp[0..1]) and mins (utmp[2..3])
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];   // all quants of one super-block expanded to one value per byte
    int16_t aux16[8];    // 8 products q8 * q4
    float sums [8];      // 8 running float partial sums across super-blocks
    int32_t aux32[8];    // 8 running integer partial sums within one super-block
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        // expand nibbles: for every 32 input bytes, 32 low nibbles followed by 32 high nibbles
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // mins term: two consecutive bsums entries share one min
        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        // one scale per group of 32 quants, processed as 4 chunks of 8
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
|
|
|
|
// Dot product of a row of Q5_K super-blocks (vx) with a row of Q8_K super-blocks (vy).
//
// Parameters:
//   n          - number of elements in the row; must be a multiple of QK_K
//   s          - output; receives the single float result in *s
//   vx         - pointer to n/QK_K block_q5_K entries
//   vy         - pointer to n/QK_K block_q8_K entries
//   bs, bx, by - not used by this implementation
//   nrc        - must be 1 (asserted)
//
// Each 5-bit quant is assembled from a 4-bit nibble in x[i].qs plus one bit from x[i].qh
// (worth 16 when set). For every super-block i the contribution is
//     d    * sum_j( scale_j * sum(q5 * q8) )  -  dmin * sum_j( min_j * bsums )
// with d = x[i].d * y[i].d and dmin = x[i].dmin * y[i].d. The 12-byte x[i].scales field is
// unpacked into 8 scale bytes (utmp[0..1]) and 8 min bytes (utmp[2..3]).
// Three implementations are selected at compile time: AVX2, AVX, and a scalar fallback.
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of super-blocks in the row
    const int nb = n / QK_K;

    // masks used to unpack the 6-bit scales/mins: low 6 bits, low 4 bits, low 2 bits of every byte
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    // scratch for the unpacked scales (words 0..1) and mins (words 2..3)
    uint32_t utmp[4];

#if defined __AVX2__

    const __m256i m4 = _mm256_set1_epi8(0xF);
    const __m128i mzero = _mm_setzero_si128();
    const __m256i mone = _mm256_set1_epi8(1);

    __m256 acc = _mm256_setzero_ps();   // accumulates d * (scaled quant products)

    float summs = 0.f;                  // accumulates dmin * (mins * bsums); dmin is already negated

    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        // negated so that the mins term can be folded in with an add
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // widen to 16 bit: low 128-bit lane = scales, high 128-bit lane = mins
        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));

        // mins term: add adjacent pairs of bsums so that one sum lines up with each min,
        // then reduce the four 32-bit products to a single integer in element 0
        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
        summs += dmin * _mm_extract_epi32(hsum, 0);

        // duplicate the 8 scales into both 128-bit lanes for the per-lane byte shuffles below
        const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
        const __m256i scales = MM256_SET_M128I(sc128, sc128);

        // 32 bytes of high bits; hmask selects one bit position per byte and moves up after each use
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
        __m256i hmask = mone;

        __m256i sumi = _mm256_setzero_si256();

        // index of the bit currently selected by hmask
        int bit = 0;

        // each iteration consumes 32 bytes of q5 (64 quants) and 64 bytes of q8
        for (int j = 0; j < QK_K/64; ++j) {

            // presumably get_scale_shuffle_k4(k) yields the mask that broadcasts scale k - defined elsewhere
            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));

            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;

            // low nibbles + selected high bit moved to bit position 4 (value 16)
            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
            const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
            hmask = _mm256_slli_epi16(hmask, 1);

            // high nibbles + next high bit
            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
            const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
            hmask = _mm256_slli_epi16(hmask, 1);

            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

            // unsigned q5 times signed q8, adjacent pairs summed into 16-bit lanes
            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);

            // multiply with the group scales, widening to 32 bit
            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
            p16_1 = _mm256_madd_epi16(scale_1, p16_1);

            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));

        }

        // multiply with the block scale and accumulate
        __m256 vd = _mm256_set1_ps(d);
        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);

    }

    *s = hsum_float_8(acc) + summs;

#elif defined __AVX__

    const __m128i m4 = _mm_set1_epi8(0xF);
    const __m128i mzero = _mm_setzero_si128();
    const __m128i mone = _mm_set1_epi8(1);
    const __m128i m2 = _mm_set1_epi8(2);

    __m256 acc = _mm256_setzero_ps();   // accumulates d * (scaled quant products)

    float summs = 0.f;                  // accumulates dmin * (mins * bsums); dmin is already negated

    for (int i = 0; i < nb; ++i) {

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        // negated so that the mins term can be folded in with an add
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // widen to 16 bit: scales from the low 8 bytes, mins from the high 8 bytes
        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
        const __m128i scales = _mm_cvtepu8_epi16(utmps);
        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));

        // mins term: add adjacent pairs of bsums so that one sum lines up with each min,
        // then reduce the four 32-bit products to a single integer in element 0
        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
        const __m128i prod = _mm_madd_epi16(mins, q8s);
        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
        summs += dmin * _mm_extract_epi32(hsum, 0);

        // 2 x 16 bytes of high bits; hmask selects one bit position per byte and moves up after each use
        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
        __m128i hmask = mone;

        // two 128-bit integer accumulators, combined into one 256-bit vector after the loop
        __m128i sumi_0 = _mm_setzero_si128();
        __m128i sumi_1 = _mm_setzero_si128();

        // index of the bit currently selected by hmask
        int bit = 0;

        // byte-shuffle mask that broadcasts 16-bit scale 0; adding m2 (2 in every byte) moves to the next scale
        __m128i shuffle = _mm_set1_epi16(0x0100);
        for (int j = 0; j < QK_K/64; ++j) {

            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
            shuffle = _mm_add_epi16(shuffle, m2);
            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
            shuffle = _mm_add_epi16(shuffle, m2);

            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;

            // low nibbles + selected high bit moved to bit position 4 (value 16);
            // both halves use the same bit index, which is advanced once after the second half
            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
            __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
            __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
            hmask = _mm_slli_epi16(hmask, 1);

            // both 16-byte halves of the low-nibble group share scale_0
            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
            p16_0 = _mm_madd_epi16(scale_0, p16_0);
            p16_1 = _mm_madd_epi16(scale_0, p16_1);

            // high nibbles + next high bit
            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
            q5_0 = _mm_add_epi8(q5l_0, q5h_0);
            q5_1 = _mm_add_epi8(q5l_1, q5h_1);
            hmask = _mm_slli_epi16(hmask, 1);

            // both 16-byte halves of the high-nibble group share scale_1
            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
            p16_2 = _mm_madd_epi16(scale_1, p16_2);
            p16_3 = _mm_madd_epi16(scale_1, p16_3);

            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

        }

        // multiply with the block scale and accumulate
        __m256 vd = _mm256_set1_ps(d);
        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

    }

    *s = hsum_float_8(acc) + summs;

#else

    // scalar fallback: byte views onto the unpacked scales (utmp[0..1]) and mins (utmp[2..3])
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];   // all quants of one super-block expanded to one value per byte
    int16_t aux16[8];    // 8 products q8 * q5
    float sums [8];      // 8 running float partial sums across super-blocks
    int32_t aux32[8];    // 8 running integer partial sums within one super-block
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        // m selects the bit of qh[l] that belongs to the current group of 32 quants
        uint8_t m = 1;
        // expand: for every 32 input bytes, 32 low nibbles then 32 high nibbles, each +16 if its high bit is set
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q5[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (qh[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q5[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (qh[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q5 += 32;
        }
        // unpack the 12 packed bytes: afterwards utmp[0..1] = 8 scales, utmp[2..3] = 8 mins
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // mins term: two consecutive bsums entries share one min
        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        // one scale per group of 32 quants, processed as 4 chunks of 8
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
|
|
|
|
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
assert(n % QK_K == 0);
|
|
assert(nrc == 1);
|
|
UNUSED(nrc);
|
|
UNUSED(bx);
|
|
UNUSED(by);
|
|
UNUSED(bs);
|
|
|
|
const block_q6_K * GGML_RESTRICT x = vx;
|
|
const block_q8_K * GGML_RESTRICT y = vy;
|
|
|
|
const int nb = n / QK_K;
|
|
|
|
#if defined __AVX2__
|
|
|
|
const __m256i m4 = _mm256_set1_epi8(0xF);
|
|
const __m256i m2 = _mm256_set1_epi8(3);
|
|
const __m256i m32s = _mm256_set1_epi8(32);
|
|
|
|
__m256 acc = _mm256_setzero_ps();
|
|
|
|
for (int i = 0; i < nb; ++i) {
|
|
|
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
|
|
|
__m256i sumi = _mm256_setzero_si256();
|
|
|
|
int is = 0;
|
|
|
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
|
|
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
|
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
|
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
|
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
|
is += 4;
|
|
|
|
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
|
|
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
|
|
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
|
|
|
|
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
|
|
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
|
|
const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
|
|
const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
|
|
|
|
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
|
|
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
|
|
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
|
|
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
|
|
|
|
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
|
|
|
__m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
|
|
__m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
|
|
__m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
|
|
__m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
|
|
|
|
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
|
|
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
|
|
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
|
|
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
|
|
|
|
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
|
|
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
|
|
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
|
|
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
|
|
|
|
p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
|
|
p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
|
|
p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
|
|
p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
|
|
|
|
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
|
|
sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
|
|
|
|
}
|
|
|
|
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
|
|
}
|
|
|
|
*s = hsum_float_8(acc);
|
|
|
|
#elif defined __AVX__
|
|
|
|
const __m128i m3 = _mm_set1_epi8(3);
|
|
const __m128i m15 = _mm_set1_epi8(15);
|
|
|
|
__m256 acc = _mm256_setzero_ps();
|
|
|
|
for (int i = 0; i < nb; ++i) {
|
|
|
|
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
|
|
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
// handle the q6_k -32 offset separately using bsums
|
|
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
|
|
const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
|
|
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
|
const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
|
|
const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
|
|
const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
|
|
const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
|
|
|
|
__m128i sumi_0 = _mm_setzero_si128();
|
|
__m128i sumi_1 = _mm_setzero_si128();
|
|
|
|
int is = 0;
|
|
|
|
for (int j = 0; j < QK_K/128; ++j) {
|
|
|
|
const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
|
|
const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
|
|
|
|
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
|
|
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
|
|
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
|
|
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
|
|
const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
|
|
const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
|
|
const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
|
|
const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
|
|
|
|
const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
|
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
|
const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
|
const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
|
|
|
|
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
|
|
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
|
|
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
|
|
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
|
|
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
|
|
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
|
|
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
|
|
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
|
|
|
|
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
|
|
|
|
__m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
|
|
__m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
|
|
__m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
|
|
__m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
|
|
__m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
|
|
__m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
|
|
__m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
|
|
__m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
|
|
|
|
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
|
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
|
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
|
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
|
is += 4;
|
|
|
|
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
|
|
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
|
|
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
|
|
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
|
|
p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
|
|
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
|
|
p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
|
|
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
|
|
|
|
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
|
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
|
|
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
|
|
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
|
|
|
|
}
|
|
|
|
sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
|
|
sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
|
|
const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
|
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
|
|
}
|
|
|
|
*s = hsum_float_8(acc);
|
|
|
|
#else
|
|
|
|
int8_t aux8[QK_K];
|
|
int16_t aux16[8];
|
|
float sums [8];
|
|
int32_t aux32[8];
|
|
memset(sums, 0, 8*sizeof(float));
|
|
|
|
float sumf = 0;
|
|
for (int i = 0; i < nb; ++i) {
|
|
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
memset(aux32, 0, 8*sizeof(int32_t));
|
|
int8_t * GGML_RESTRICT a = aux8;
|
|
for (int j = 0; j < QK_K; j += 128) {
|
|
for (int l = 0; l < 32; ++l) {
|
|
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
}
|
|
a += 128;
|
|
q4 += 64;
|
|
qh += 32;
|
|
}
|
|
a = aux8;
|
|
int is = 0;
|
|
for (int j = 0; j < QK_K/16; ++j) {
|
|
int scale = x[i].scales[is++];
|
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
q8 += 8; a += 8;
|
|
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
q8 += 8; a += 8;
|
|
}
|
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
}
|
|
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
*s = sumf;
|
|
#endif
|
|
}
|
|
|
|
#if defined (__AVX__) || defined (__AVX2__)
|
|
// Sign lookup table, only compiled when AVX or AVX2 is available.
// Holds 128 entries of 8 int8 values, each +1 or -1 (1024 bytes in total). The AVX/AVX2 paths of
// ggml_vec_dot_iq2_xxs_q8_K view it as an array of 128 uint64_t and index it with a 7-bit sign
// code; the selected 8 bytes are then fed to the *_sign_epi8 intrinsics.
// In entry k, byte j (j = 0..6) is -1 when bit j of k is set, and byte 7 is set so that every
// entry contains an even number of -1 values.
static const int8_t keven_signs_q2xs[1024] = {
|
|
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
|
1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
|
|
1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
|
|
1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
|
|
1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
|
|
1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
|
|
1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
|
|
1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
|
|
1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
|
|
1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
|
|
1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
|
|
1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
|
|
1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
|
|
1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
|
|
1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
|
|
1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
|
|
1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
|
|
1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
|
|
1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
|
|
1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
|
|
1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
|
|
1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
|
|
1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
|
|
1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
|
|
1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
|
|
1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
|
|
1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
|
|
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
|
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
|
};
|
|
#endif
|
|
|
|
// Dot product of a row of IQ2_XXS blocks (vx) with a row of Q8_K blocks (vy); the single
// result is written to *s.
//
//   n               - number of elements, must be a multiple of QK_K
//   s               - output, receives the dot product
//   vx              - n/QK_K blocks of type block_iq2_xxs
//   vy              - n/QK_K blocks of type block_q8_K
//   bs, bx, by, nrc - not used by this kernel; nrc must be 1
//
// For every group of 32 values the x block supplies two 32-bit words. The four bytes of the
// first word are indices into iq2xxs_grid (8 values each). The second word carries four 7-bit
// sign codes in bits 0..27 and a 4-bit scale in bits 28..31. A group contributes
// (2*scale + 1) * sum(grid * sign * q8); each block sum is multiplied by d_x * d_y and the
// final total by 0.125.
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)

    // the sign table is consumed 8 bytes (one 7-bit sign code) at a time
    const uint64_t * sign_lut = (const uint64_t *)keven_signs_q2xs;

    // two groups of 32 values are handled per iteration: words[0..1] and words[2..3]
    uint32_t words[4];
    const uint8_t * grid_idx = (const uint8_t *)words;

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        const uint16_t * GGML_RESTRICT q2 = x[ibl].qs;
        const int8_t   * GGML_RESTRICT q8 = y[ibl].qs;

        __m256i isum_a = _mm256_setzero_si256();
        __m256i isum_b = _mm256_setzero_si256();
        for (int blk = 0; blk < QK_K/32; blk += 2) {
            const __m256i q8_a = _mm256_loadu_si256((const __m256i *)(q8 +  0));
            const __m256i q8_b = _mm256_loadu_si256((const __m256i *)(q8 + 32));
            q8 += 64;

            memcpy(words, q2, 4*sizeof(uint32_t));
            q2 += 8;
            const uint32_t meta_a = words[1]; // signs + scale of the first group
            const uint32_t meta_b = words[3]; // signs + scale of the second group

            const __m256i grid_a = _mm256_set_epi64x(iq2xxs_grid[grid_idx[ 3]], iq2xxs_grid[grid_idx[ 2]],
                                                     iq2xxs_grid[grid_idx[ 1]], iq2xxs_grid[grid_idx[ 0]]);
            const __m256i grid_b = _mm256_set_epi64x(iq2xxs_grid[grid_idx[11]], iq2xxs_grid[grid_idx[10]],
                                                     iq2xxs_grid[grid_idx[ 9]], iq2xxs_grid[grid_idx[ 8]]);

            const __m256i sgn_a = _mm256_set_epi64x(sign_lut[(meta_a >> 21) & 127], sign_lut[(meta_a >> 14) & 127],
                                                    sign_lut[(meta_a >>  7) & 127], sign_lut[(meta_a >>  0) & 127]);
            const __m256i sgn_b = _mm256_set_epi64x(sign_lut[(meta_b >> 21) & 127], sign_lut[(meta_b >> 14) & 127],
                                                    sign_lut[(meta_b >>  7) & 127], sign_lut[(meta_b >>  0) & 127]);

            // apply the signs to q8 so that the unsigned grid values can go through maddubs
            const __m256i prod_a = _mm256_maddubs_epi16(grid_a, _mm256_sign_epi8(q8_a, sgn_a));
            const __m256i prod_b = _mm256_maddubs_epi16(grid_b, _mm256_sign_epi8(q8_b, sgn_b));

            const uint16_t ls_a = meta_a >> 28;
            const uint16_t ls_b = meta_b >> 28;

            isum_a = _mm256_add_epi32(isum_a, _mm256_madd_epi16(prod_a, _mm256_set1_epi16(2*ls_a+1)));
            isum_b = _mm256_add_epi32(isum_b, _mm256_madd_epi16(prod_b, _mm256_set1_epi16(2*ls_b+1)));
        }

        acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(isum_a, isum_b)), acc);
    }

    *s = 0.125f * hsum_float_8(acc);

#elif defined(__AVX__)

    // same scheme as the AVX2 path, with every 256-bit integer op split into two 128-bit halves
    const uint64_t * sign_lut = (const uint64_t *)keven_signs_q2xs;

    uint32_t words[4];
    const uint8_t * grid_idx = (const uint8_t *)words;

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        const uint16_t * GGML_RESTRICT q2 = x[ibl].qs;
        const int8_t   * GGML_RESTRICT q8 = y[ibl].qs;

        __m128i isum_a_lo = _mm_setzero_si128();
        __m128i isum_a_hi = _mm_setzero_si128();
        __m128i isum_b_lo = _mm_setzero_si128();
        __m128i isum_b_hi = _mm_setzero_si128();
        for (int blk = 0; blk < QK_K/32; blk += 2) {
            const __m128i q8_a_lo = _mm_loadu_si128((const __m128i *)(q8 +  0));
            const __m128i q8_a_hi = _mm_loadu_si128((const __m128i *)(q8 + 16));
            const __m128i q8_b_lo = _mm_loadu_si128((const __m128i *)(q8 + 32));
            const __m128i q8_b_hi = _mm_loadu_si128((const __m128i *)(q8 + 48));
            q8 += 64;

            memcpy(words, q2, 4*sizeof(uint32_t));
            q2 += 8;
            const uint32_t meta_a = words[1];
            const uint32_t meta_b = words[3];

            const __m128i grid_a_lo = _mm_set_epi64x(iq2xxs_grid[grid_idx[ 1]], iq2xxs_grid[grid_idx[ 0]]);
            const __m128i grid_a_hi = _mm_set_epi64x(iq2xxs_grid[grid_idx[ 3]], iq2xxs_grid[grid_idx[ 2]]);
            const __m128i grid_b_lo = _mm_set_epi64x(iq2xxs_grid[grid_idx[ 9]], iq2xxs_grid[grid_idx[ 8]]);
            const __m128i grid_b_hi = _mm_set_epi64x(iq2xxs_grid[grid_idx[11]], iq2xxs_grid[grid_idx[10]]);

            const __m128i sgn_a_lo = _mm_set_epi64x(sign_lut[(meta_a >>  7) & 127], sign_lut[(meta_a >>  0) & 127]);
            const __m128i sgn_a_hi = _mm_set_epi64x(sign_lut[(meta_a >> 21) & 127], sign_lut[(meta_a >> 14) & 127]);
            const __m128i sgn_b_lo = _mm_set_epi64x(sign_lut[(meta_b >>  7) & 127], sign_lut[(meta_b >>  0) & 127]);
            const __m128i sgn_b_hi = _mm_set_epi64x(sign_lut[(meta_b >> 21) & 127], sign_lut[(meta_b >> 14) & 127]);

            const __m128i prod_a_lo = _mm_maddubs_epi16(grid_a_lo, _mm_sign_epi8(q8_a_lo, sgn_a_lo));
            const __m128i prod_a_hi = _mm_maddubs_epi16(grid_a_hi, _mm_sign_epi8(q8_a_hi, sgn_a_hi));
            const __m128i prod_b_lo = _mm_maddubs_epi16(grid_b_lo, _mm_sign_epi8(q8_b_lo, sgn_b_lo));
            const __m128i prod_b_hi = _mm_maddubs_epi16(grid_b_hi, _mm_sign_epi8(q8_b_hi, sgn_b_hi));

            const uint16_t ls_a = meta_a >> 28;
            const uint16_t ls_b = meta_b >> 28;
            const __m128i scale_a = _mm_set1_epi16(2*ls_a+1);
            const __m128i scale_b = _mm_set1_epi16(2*ls_b+1);

            isum_a_lo = _mm_add_epi32(isum_a_lo, _mm_madd_epi16(prod_a_lo, scale_a));
            isum_a_hi = _mm_add_epi32(isum_a_hi, _mm_madd_epi16(prod_a_hi, scale_a));
            isum_b_lo = _mm_add_epi32(isum_b_lo, _mm_madd_epi16(prod_b_lo, scale_b));
            isum_b_hi = _mm_add_epi32(isum_b_hi, _mm_madd_epi16(prod_b_hi, scale_b));
        }

        const __m128i total_lo = _mm_add_epi32(isum_a_lo, isum_b_lo);
        const __m128i total_hi = _mm_add_epi32(isum_a_hi, isum_b_hi);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(total_hi, total_lo))), acc);
    }

    *s = 0.125f * hsum_float_8(acc);

#else

    // scalar reference path: one group of 32 values (two 32-bit words) per inner iteration
    uint32_t packed[2];
    const uint8_t * grid_idx = (const uint8_t *)packed;

    float total = 0.f;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        const uint16_t * GGML_RESTRICT q2 = x[ibl].qs;
        const int8_t   * GGML_RESTRICT q8 = y[ibl].qs;

        int32_t block_sum = 0;
        for (int blk = 0; blk < QK_K/32; ++blk) {
            memcpy(packed, q2, 2*sizeof(uint32_t));
            q2 += 4;

            // kept unsigned on purpose: the multiply below is carried out in uint32_t arithmetic
            const uint32_t ls = 2*(packed[1] >> 28) + 1;

            int32_t group_sum = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid  = (const uint8_t *)(iq2xxs_grid + grid_idx[l]);
                const uint8_t   signs = ksigns_iq2xs[(packed[1] >> 7*l) & 127];
                const int8_t  * q8l   = q8 + 8*l;
                for (int j = 0; j < 8; ++j) {
                    const int sgn = (signs & kmask_iq2xs[j]) ? -1 : 1;
                    group_sum += grid[j] * q8l[j] * sgn;
                }
            }
            q8 += 32;

            block_sum += group_sum * ls;
        }
        total += d * block_sum;
    }
    *s = 0.125f * total;

#endif
}
|
|
|
|
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
assert(n % QK_K == 0);
|
|
assert(nrc == 1);
|
|
UNUSED(nrc);
|
|
UNUSED(bx);
|
|
UNUSED(by);
|
|
UNUSED(bs);
|
|
|
|
const block_iq2_xs * GGML_RESTRICT x = vx;
|
|
const block_q8_K * GGML_RESTRICT y = vy;
|
|
|
|
const int nb = n / QK_K;
|
|
|
|
#if defined(__AVX2__)
|
|
|
|
const __m256i mone = _mm256_set1_epi8(1);
|
|
static const char block_sign_shuffle_mask_1[32] = {
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
|
};
|
|
static const char block_sign_shuffle_mask_2[32] = {
|
|
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
|
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
|
};
|
|
static const uint8_t bit_selector_mask_bytes[32] = {
|
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
};
|
|
|
|
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
|
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
|
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
|
|
|
static const uint8_t k_bit_helper[32] = {
|
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
};
|
|
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
|
const __m256i m511 = _mm256_set1_epi16(511);
|
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
const __m128i m1 = _mm_set1_epi8(1);
|
|
|
|
uint64_t aux64;
|
|
|
|
// somewhat hacky, but gives a significant boost in performance
|
|
__m256i aux_gindex;
|
|
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
|
|
|
__m256 accumf = _mm256_setzero_ps();
|
|
for (int i = 0; i < nb; ++i) {
|
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
memcpy(&aux64, x[i].scales, 8);
|
|
__m128i stmp = _mm_set1_epi64x(aux64);
|
|
stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
|
|
const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
|
|
|
|
__m256i sumi1 = _mm256_setzero_si256();
|
|
__m256i sumi2 = _mm256_setzero_si256();
|
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
|
|
|
const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
|
|
aux_gindex = _mm256_and_si256(q2_data, m511);
|
|
|
|
const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
|
|
const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
|
|
const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
|
|
|
|
const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
|
const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
|
|
|
|
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
|
|
|
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
|
|
iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
|
|
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
|
|
iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
|
|
const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
|
|
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
|
|
const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
|
|
iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
|
|
|
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
|
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
|
const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
|
|
const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
|
|
|
|
__m256i signs;
|
|
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
|
|
|
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
|
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
|
|
|
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
|
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
|
|
|
|
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
|
|
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
|
const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
|
|
|
|
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
|
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
|
const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
|
|
const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
|
|
|
|
const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
|
|
const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
|
|
const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
|
|
const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
|
|
|
|
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
|
|
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
|
|
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
|
|
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
|
|
}
|
|
|
|
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
|
|
|
}
|
|
|
|
*s = 0.125f * hsum_float_8(accumf);
|
|
|
|
#elif defined(__AVX__)
|
|
const __m128i mone = _mm_set1_epi8(1);
|
|
static const char block_sign_shuffle_mask_1[32] = {
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
|
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
|
};
|
|
static const char block_sign_shuffle_mask_2[32] = {
|
|
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
|
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
|
};
|
|
static const uint8_t bit_selector_mask_bytes[32] = {
|
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
|
};
|
|
|
|
const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
|
|
const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
|
|
const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
|
|
const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
|
|
const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
|
|
const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
|
|
|
|
static const uint8_t k_bit_helper[32] = {
|
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
|
};
|
|
const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
|
|
const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
|
|
const __m128i m511 = _mm_set1_epi16(511);
|
|
const __m128i m4 = _mm_set1_epi8(0xf);
|
|
const __m128i m1 = _mm_set1_epi8(1);
|
|
|
|
uint64_t aux64;
|
|
|
|
// somewhat hacky, but gives a significant boost in performance
|
|
__m256i aux_gindex;
|
|
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
|
|
|
__m256 accumf = _mm256_setzero_ps();
|
|
for (int i = 0; i < nb; ++i) {
|
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
|
|
memcpy(&aux64, x[i].scales, 8);
|
|
__m128i stmp = _mm_set1_epi64x(aux64);
|
|
stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
|
|
const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
|
|
|
|
__m128i sumi1_0 = _mm_setzero_si128();
|
|
__m128i sumi1_1 = _mm_setzero_si128();
|
|
__m128i sumi2_0 = _mm_setzero_si128();
|
|
__m128i sumi2_1 = _mm_setzero_si128();
|
|
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
|
|
|
const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
|
|
const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
|
|
aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
|
|
|
|
const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
|
|
const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
|
|
const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
|
|
const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
|
|
const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
|
|
const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
|
|
|
|
const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
|
|
const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
|
|
const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
|
|
const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
|
|
|
|
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
|
|
|
const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
|
|
const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
|
|
const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
|
|
const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
|
|
const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
|
|
const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
|
|
const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
|
const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
|
|
|
|
// AVX2 full_signs_1 is full_sign_bits_0 here
|
|
// AVX2 full_signs_2 is full_sign_bits_1 here
|
|
__m128i signs_0, signs_1;
|
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
|
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
|
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
|
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
|
|
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
|
|
|
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
|
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
|
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
|
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
|
|
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
|
|
|
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
|
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
|
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
|
const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
|
|
const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
|
|
|
|
signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
|
|
signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
|
|
signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
|
|
signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
|
|
const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
|
|
const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
|
|
|
|
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
|
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
|
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
|
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
|
const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
|
|
const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
|
|
const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
|
|
const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
|
|
|
|
__m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
|
|
const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
|
|
const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
|
|
const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
|
|
const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
|
|
const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
|
|
const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
|
sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
|
|
const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
|
|
const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
|
|
|
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
|
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
|
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
|
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
|
|
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
|
|
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
|
|
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
|
|
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
|
|
}
|
|
|
|
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
|
|
|
}
|
|
|
|
*s = 0.125f * hsum_float_8(accumf);
|
|
|
|
#else
|
|
|
|
float sumf = 0.f;
|
|
for (int i = 0; i < nb; ++i) {
|
|
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
int32_t bsum = 0;
|
|
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
int32_t sumi = 0;
|
|
for (int l = 0; l < 2; ++l) {
|
|
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
for (int j = 0; j < 8; ++j) {
|
|
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
}
|
|
q8 += 8;
|
|
}
|
|
bsum += sumi * ls1;
|
|
sumi = 0;
|
|
for (int l = 2; l < 4; ++l) {
|
|
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
for (int j = 0; j < 8; ++j) {
|
|
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
}
|
|
q8 += 8;
|
|
}
|
|
bsum += sumi * ls2;
|
|
q2 += 4;
|
|
}
|
|
sumf += d * bsum;
|
|
}
|
|
*s = 0.125f * sumf;
|
|
#endif
|
|
}
|
|
|
|
// Dot product of a row of block_iq2_s blocks (vx) with a row of block_q8_K blocks (vy).
//   n   - number of values in the row; must be a multiple of QK_K
//   s   - receives the single float result
//   nrc - must be 1; bs, bx and by are not used
// Every path sums, per block, the sign-adjusted grid*q8 products weighted by the odd
// sub-block scale (2*nibble + 1), multiplies by the combined block scale d and finally
// applies a factor of 0.125.
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(bs);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(nrc);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)

    // shuffle control: output byte i reads byte i/8 of the broadcast 32-bit sign word
    static const uint8_t sign_byte_sel[32] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };
    // output byte i tests bit i%8 of the sign byte it received
    static const uint8_t sign_bit_sel[32] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m128i nibble_mask = _mm_set1_epi8(0xf);
    const __m128i one8        = _mm_set1_epi8(1);

    const __m256i byte_sel = _mm256_loadu_si256((const __m256i *)sign_byte_sel);
    const __m256i bit_sel  = _mm256_loadu_si256((const __m256i *)sign_bit_sel);

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t  * GGML_RESTRICT qs    = x[ibl].qs;
        const uint8_t  * GGML_RESTRICT qh    = x[ibl].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[ibl].qs + QK_K/8);
        const int8_t   * GGML_RESTRICT q8    = y[ibl].qs;

        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        // low 8 bytes: low nibbles of the 8 scale bytes, high 8 bytes: their high nibbles;
        // each nibble v becomes the odd scale 2*v + 1
        uint64_t packed_scales;
        memcpy(&packed_scales, x[ibl].scales, 8);
        const __m128i nibbles  = _mm_and_si128(_mm_set_epi64x(packed_scales >> 4, packed_scales), nibble_mask);
        const __m128i scales8  = _mm_add_epi8(_mm_slli_epi16(nibbles, 1), one8);
        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15

        __m256i isum_a = _mm256_setzero_si256();
        __m256i isum_b = _mm256_setzero_si256();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_a = _mm256_loadu_si256((const __m256i *)(q8 +  0));
            const __m256i q8_b = _mm256_loadu_si256((const __m256i *)(q8 + 32));
            q8 += 64;

            // two bits of the qh byte extend each qs byte to a 10-bit grid index
            const int hb_a = qh[ib32 + 0];
            const int hb_b = qh[ib32 + 1];
            const __m256i grid_a = _mm256_set_epi64x(iq2s_grid[qs[3] | ((hb_a << 2) & 0x300)],
                                                     iq2s_grid[qs[2] | ((hb_a << 4) & 0x300)],
                                                     iq2s_grid[qs[1] | ((hb_a << 6) & 0x300)],
                                                     iq2s_grid[qs[0] | ((hb_a << 8) & 0x300)]);
            const __m256i grid_b = _mm256_set_epi64x(iq2s_grid[qs[7] | ((hb_b << 2) & 0x300)],
                                                     iq2s_grid[qs[6] | ((hb_b << 4) & 0x300)],
                                                     iq2s_grid[qs[5] | ((hb_b << 6) & 0x300)],
                                                     iq2s_grid[qs[4] | ((hb_b << 8) & 0x300)]);
            qs += 8;

            const uint32_t sbits_a = signs[0] | ((uint32_t) signs[1] << 16);
            const uint32_t sbits_b = signs[2] | ((uint32_t) signs[3] << 16);
            signs += 4;

            // neg_* is 0xFF in bytes whose sign bit is set; (v ^ m) - m negates exactly those bytes
            const __m256i hit_a = _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(sbits_a), byte_sel), bit_sel);
            const __m256i neg_a = _mm256_cmpeq_epi8(hit_a, bit_sel);
            const __m256i q8s_a = _mm256_sub_epi8(_mm256_xor_si256(neg_a, q8_a), neg_a);

            const __m256i hit_b = _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(sbits_b), byte_sel), bit_sel);
            const __m256i neg_b = _mm256_cmpeq_epi8(hit_b, bit_sel);
            const __m256i q8s_b = _mm256_sub_epi8(_mm256_xor_si256(neg_b, q8_b), neg_b);

            const __m256i dot_a = _mm256_maddubs_epi16(grid_a, q8s_a); // blocks 2*ib32+0, 2*ib32+1
            const __m256i dot_b = _mm256_maddubs_epi16(grid_b, q8s_b); // blocks 2*ib32+2, 2*ib32+3

            const __m256i sc_a = _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32 + 0));
            const __m256i sc_b = _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32 + 1));

            isum_a = _mm256_add_epi32(isum_a, _mm256_madd_epi16(dot_a, sc_a));
            isum_b = _mm256_add_epi32(isum_b, _mm256_madd_epi16(dot_b, sc_b));
        }

        acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(isum_a, isum_b)), acc);
    }

    *s = 0.125f * hsum_float_8(acc);

#elif defined(__AVX__)

    // shuffle control: output byte i reads byte i/8 of the broadcast 32-bit sign word
    static const uint8_t sign_byte_sel[32] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };
    // output byte i tests bit i%8 of the sign byte it received
    static const uint8_t sign_bit_sel[32] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m128i nibble_mask = _mm_set1_epi8(0xf);
    const __m128i one8        = _mm_set1_epi8(1);

    const __m128i byte_sel_lo = _mm_loadu_si128((const __m128i *)sign_byte_sel);
    const __m128i byte_sel_hi = _mm_loadu_si128((const __m128i *)sign_byte_sel + 1);
    const __m128i bit_sel_lo  = _mm_loadu_si128((const __m128i *)sign_bit_sel);
    const __m128i bit_sel_hi  = _mm_loadu_si128((const __m128i *)sign_bit_sel + 1);

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t  * GGML_RESTRICT qs    = x[ibl].qs;
        const uint8_t  * GGML_RESTRICT qh    = x[ibl].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[ibl].qs + QK_K/8);
        const int8_t   * GGML_RESTRICT q8    = y[ibl].qs;

        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        // same scale unpacking as the 256-bit path, kept as two 128-bit halves
        uint64_t packed_scales;
        memcpy(&packed_scales, x[ibl].scales, 8);
        const __m128i nibbles     = _mm_and_si128(_mm_set_epi64x(packed_scales >> 4, packed_scales), nibble_mask);
        const __m128i scales8     = _mm_add_epi8(_mm_slli_epi16(nibbles, 1), one8);
        const __m128i scales16_lo = _mm_cvtepi8_epi16(scales8);
        const __m128i scales16_hi = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));

        __m128i isum_a_lo = _mm_setzero_si128();
        __m128i isum_a_hi = _mm_setzero_si128();
        __m128i isum_b_lo = _mm_setzero_si128();
        __m128i isum_b_hi = _mm_setzero_si128();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m128i q8_a_lo = _mm_loadu_si128((const __m128i *)(q8 +  0));
            const __m128i q8_a_hi = _mm_loadu_si128((const __m128i *)(q8 + 16));
            const __m128i q8_b_lo = _mm_loadu_si128((const __m128i *)(q8 + 32));
            const __m128i q8_b_hi = _mm_loadu_si128((const __m128i *)(q8 + 48));
            q8 += 64;

            const int hb_a = qh[ib32 + 0];
            const int hb_b = qh[ib32 + 1];
            const __m128i grid_a_lo = _mm_set_epi64x(iq2s_grid[qs[1] | ((hb_a << 6) & 0x300)],
                                                     iq2s_grid[qs[0] | ((hb_a << 8) & 0x300)]);
            const __m128i grid_a_hi = _mm_set_epi64x(iq2s_grid[qs[3] | ((hb_a << 2) & 0x300)],
                                                     iq2s_grid[qs[2] | ((hb_a << 4) & 0x300)]);
            const __m128i grid_b_lo = _mm_set_epi64x(iq2s_grid[qs[5] | ((hb_b << 6) & 0x300)],
                                                     iq2s_grid[qs[4] | ((hb_b << 8) & 0x300)]);
            const __m128i grid_b_hi = _mm_set_epi64x(iq2s_grid[qs[7] | ((hb_b << 2) & 0x300)],
                                                     iq2s_grid[qs[6] | ((hb_b << 4) & 0x300)]);
            qs += 8;

            const uint32_t sbits_a = signs[0] | ((uint32_t) signs[1] << 16);
            const uint32_t sbits_b = signs[2] | ((uint32_t) signs[3] << 16);
            signs += 4;

            // neg_* is 0xFF in bytes whose sign bit is set; (v ^ m) - m negates exactly those bytes
            const __m128i sword_a  = _mm_set1_epi32(sbits_a);
            const __m128i hit_a_lo = _mm_and_si128(_mm_shuffle_epi8(sword_a, byte_sel_lo), bit_sel_lo);
            const __m128i hit_a_hi = _mm_and_si128(_mm_shuffle_epi8(sword_a, byte_sel_hi), bit_sel_hi);
            const __m128i neg_a_lo = _mm_cmpeq_epi8(hit_a_lo, bit_sel_lo);
            const __m128i neg_a_hi = _mm_cmpeq_epi8(hit_a_hi, bit_sel_hi);
            const __m128i q8s_a_lo = _mm_sub_epi8(_mm_xor_si128(neg_a_lo, q8_a_lo), neg_a_lo);
            const __m128i q8s_a_hi = _mm_sub_epi8(_mm_xor_si128(neg_a_hi, q8_a_hi), neg_a_hi);

            const __m128i sword_b  = _mm_set1_epi32(sbits_b);
            const __m128i hit_b_lo = _mm_and_si128(_mm_shuffle_epi8(sword_b, byte_sel_lo), bit_sel_lo);
            const __m128i hit_b_hi = _mm_and_si128(_mm_shuffle_epi8(sword_b, byte_sel_hi), bit_sel_hi);
            const __m128i neg_b_lo = _mm_cmpeq_epi8(hit_b_lo, bit_sel_lo);
            const __m128i neg_b_hi = _mm_cmpeq_epi8(hit_b_hi, bit_sel_hi);
            const __m128i q8s_b_lo = _mm_sub_epi8(_mm_xor_si128(neg_b_lo, q8_b_lo), neg_b_lo);
            const __m128i q8s_b_hi = _mm_sub_epi8(_mm_xor_si128(neg_b_hi, q8_b_hi), neg_b_hi);

            const __m128i dot_a_lo = _mm_maddubs_epi16(grid_a_lo, q8s_a_lo);
            const __m128i dot_a_hi = _mm_maddubs_epi16(grid_a_hi, q8s_a_hi);
            const __m128i dot_b_lo = _mm_maddubs_epi16(grid_b_lo, q8s_b_lo);
            const __m128i dot_b_hi = _mm_maddubs_epi16(grid_b_hi, q8s_b_hi);

            const __m128i sc_a_lo = _mm_shuffle_epi8(scales16_lo, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32 + 0), 0));
            const __m128i sc_a_hi = _mm_shuffle_epi8(scales16_hi, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32 + 0), 1));
            const __m128i sc_b_lo = _mm_shuffle_epi8(scales16_lo, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32 + 1), 0));
            const __m128i sc_b_hi = _mm_shuffle_epi8(scales16_hi, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32 + 1), 1));

            isum_a_lo = _mm_add_epi32(isum_a_lo, _mm_madd_epi16(dot_a_lo, sc_a_lo));
            isum_a_hi = _mm_add_epi32(isum_a_hi, _mm_madd_epi16(dot_a_hi, sc_a_hi));
            isum_b_lo = _mm_add_epi32(isum_b_lo, _mm_madd_epi16(dot_b_lo, sc_b_lo));
            isum_b_hi = _mm_add_epi32(isum_b_hi, _mm_madd_epi16(dot_b_hi, sc_b_hi));
        }

        const __m128i tot_lo = _mm_add_epi32(isum_a_lo, isum_b_lo);
        const __m128i tot_hi = _mm_add_epi32(isum_a_hi, isum_b_hi);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(tot_hi, tot_lo))), acc);
    }

    *s = 0.125f * hsum_float_8(acc);

#else

    float total = 0;
    for (int ibl = 0; ibl < nb; ibl++) {
        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        const uint8_t * qs    = x[ibl].qs;
        const uint8_t * qh    = x[ibl].qh;
        const uint8_t * signs = qs + QK_K/8;
        const int8_t  * q8    = y[ibl].qs;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            // low nibble scales groups 0 and 1, high nibble scales groups 2 and 3
            const int ls[2] = { 1 + 2*(x[ibl].scales[ib32] & 0xf), 1 + 2*(x[ibl].scales[ib32] >> 4) };
            int part[2] = { 0, 0 };
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | ((qh[ib32] << (8 - 2*l)) & 0x300)));
                const uint8_t   sgn  = signs[l];
                int group = 0;
                for (int j = 0; j < 8; ++j) {
                    group += q8[j] * grid[j] * (sgn & kmask_iq2xs[j] ? -1 : 1);
                }
                part[l / 2] += group;
                q8 += 8;
            }
            bsum  += ls[0] * part[0] + ls[1] * part[1];
            qs    += 4;
            signs += 4;
        }

        total += d * bsum;
    }

    *s = 0.125f * total;

#endif

}
|
|
|
|
// Dot product of a row of block_iq3_xxs blocks (vx) with a row of block_q8_K blocks (vy).
//   n   - number of values in the row; must be a multiple of QK_K
//   s   - receives the single float result
//   nrc - must be 1; bs, bx and by are not used
// For every group of 32 values a 32-bit word stored after the grid indices supplies four
// 7-bit sign-table indices (bits 0..27) and a 4-bit scale (bits 28..31, used as 2*ls + 1).
// The per-block integer sum is multiplied by the combined block scale d; the final result
// is scaled by 0.25.
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(bs);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(nrc);

    const block_iq3_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)

    // the sign table is read 8 bytes at a time: one entry covers 8 values
    const uint64_t * sign_tbl = (const uint64_t *)keven_signs_q2xs;

    uint32_t meta[2];

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q3  = x[ibl].qs;
        const uint8_t * GGML_RESTRICT gas = x[ibl].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8  = y[ibl].qs;

        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        __m256i isum_a = _mm256_setzero_si256();
        __m256i isum_b = _mm256_setzero_si256();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_a = _mm256_loadu_si256((const __m256i *)(q8 +  0));
            const __m256i q8_b = _mm256_loadu_si256((const __m256i *)(q8 + 32));
            q8 += 64;

            const __m256i grid_a = _mm256_set_epi32(iq3xxs_grid[q3[ 7]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 4]],
                                                    iq3xxs_grid[q3[ 3]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 0]]);
            const __m256i grid_b = _mm256_set_epi32(iq3xxs_grid[q3[15]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[12]],
                                                    iq3xxs_grid[q3[11]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[ 8]]);
            q3 += 16;

            memcpy(meta, gas, 8);
            gas += 8;

            const __m256i sgn_a = _mm256_set_epi64x(sign_tbl[(meta[0] >> 21) & 127], sign_tbl[(meta[0] >> 14) & 127],
                                                    sign_tbl[(meta[0] >>  7) & 127], sign_tbl[(meta[0] >>  0) & 127]);
            const __m256i sgn_b = _mm256_set_epi64x(sign_tbl[(meta[1] >> 21) & 127], sign_tbl[(meta[1] >> 14) & 127],
                                                    sign_tbl[(meta[1] >>  7) & 127], sign_tbl[(meta[1] >>  0) & 127]);

            const __m256i dot_a = _mm256_maddubs_epi16(grid_a, _mm256_sign_epi8(q8_a, sgn_a));
            const __m256i dot_b = _mm256_maddubs_epi16(grid_b, _mm256_sign_epi8(q8_b, sgn_b));

            const uint16_t ls_a = meta[0] >> 28;
            const uint16_t ls_b = meta[1] >> 28;

            isum_a = _mm256_add_epi32(isum_a, _mm256_madd_epi16(dot_a, _mm256_set1_epi16(2*ls_a+1)));
            isum_b = _mm256_add_epi32(isum_b, _mm256_madd_epi16(dot_b, _mm256_set1_epi16(2*ls_b+1)));
        }

        acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(isum_a, isum_b)), acc);
    }

    *s = 0.25f * hsum_float_8(acc);

#elif defined(__AVX__)

    // the sign table is read 8 bytes at a time: one entry covers 8 values
    const uint64_t * sign_tbl = (const uint64_t *)keven_signs_q2xs;

    uint32_t meta[2];

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q3  = x[ibl].qs;
        const uint8_t * GGML_RESTRICT gas = x[ibl].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8  = y[ibl].qs;

        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        __m128i isum_a_lo = _mm_setzero_si128();
        __m128i isum_a_hi = _mm_setzero_si128();
        __m128i isum_b_lo = _mm_setzero_si128();
        __m128i isum_b_hi = _mm_setzero_si128();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m128i q8_a_lo = _mm_loadu_si128((const __m128i *)(q8 +  0));
            const __m128i q8_a_hi = _mm_loadu_si128((const __m128i *)(q8 + 16));
            const __m128i q8_b_lo = _mm_loadu_si128((const __m128i *)(q8 + 32));
            const __m128i q8_b_hi = _mm_loadu_si128((const __m128i *)(q8 + 48));
            q8 += 64;

            const __m128i grid_a_lo = _mm_set_epi32(iq3xxs_grid[q3[ 3]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 0]]);
            const __m128i grid_a_hi = _mm_set_epi32(iq3xxs_grid[q3[ 7]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 4]]);
            const __m128i grid_b_lo = _mm_set_epi32(iq3xxs_grid[q3[11]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[ 8]]);
            const __m128i grid_b_hi = _mm_set_epi32(iq3xxs_grid[q3[15]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[12]]);
            q3 += 16;

            memcpy(meta, gas, 8);
            gas += 8;

            const __m128i sgn_a_lo = _mm_set_epi64x(sign_tbl[(meta[0] >>  7) & 127], sign_tbl[(meta[0] >>  0) & 127]);
            const __m128i sgn_a_hi = _mm_set_epi64x(sign_tbl[(meta[0] >> 21) & 127], sign_tbl[(meta[0] >> 14) & 127]);
            const __m128i sgn_b_lo = _mm_set_epi64x(sign_tbl[(meta[1] >>  7) & 127], sign_tbl[(meta[1] >>  0) & 127]);
            const __m128i sgn_b_hi = _mm_set_epi64x(sign_tbl[(meta[1] >> 21) & 127], sign_tbl[(meta[1] >> 14) & 127]);

            const __m128i dot_a_lo = _mm_maddubs_epi16(grid_a_lo, _mm_sign_epi8(q8_a_lo, sgn_a_lo));
            const __m128i dot_a_hi = _mm_maddubs_epi16(grid_a_hi, _mm_sign_epi8(q8_a_hi, sgn_a_hi));
            const __m128i dot_b_lo = _mm_maddubs_epi16(grid_b_lo, _mm_sign_epi8(q8_b_lo, sgn_b_lo));
            const __m128i dot_b_hi = _mm_maddubs_epi16(grid_b_hi, _mm_sign_epi8(q8_b_hi, sgn_b_hi));

            const uint16_t ls_a = meta[0] >> 28;
            const uint16_t ls_b = meta[1] >> 28;

            isum_a_lo = _mm_add_epi32(isum_a_lo, _mm_madd_epi16(dot_a_lo, _mm_set1_epi16(2*ls_a+1)));
            isum_a_hi = _mm_add_epi32(isum_a_hi, _mm_madd_epi16(dot_a_hi, _mm_set1_epi16(2*ls_a+1)));
            isum_b_lo = _mm_add_epi32(isum_b_lo, _mm_madd_epi16(dot_b_lo, _mm_set1_epi16(2*ls_b+1)));
            isum_b_hi = _mm_add_epi32(isum_b_hi, _mm_madd_epi16(dot_b_hi, _mm_set1_epi16(2*ls_b+1)));
        }

        const __m128i tot_lo = _mm_add_epi32(isum_a_lo, isum_b_lo);
        const __m128i tot_hi = _mm_add_epi32(isum_a_hi, isum_b_hi);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(tot_hi, tot_lo))), acc);
    }

    *s = 0.25f * hsum_float_8(acc);

#else

    uint32_t meta;

    float total = 0.f;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q3  = x[ibl].qs;
        const uint8_t * GGML_RESTRICT gas = x[ibl].qs + QK_K/4;
        const int8_t  * GGML_RESTRICT q8  = y[ibl].qs;

        const float d = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;

        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&meta, gas, sizeof(uint32_t));
            gas += sizeof(uint32_t);

            const uint32_t ls = 2*(meta >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                // each group of 8 values uses two 4-byte grid entries and one sign byte
                const uint8_t * grid_lo = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid_hi = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t   sgn     = ksigns_iq2xs[(meta >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    const int w_lo = sgn & kmask_iq2xs[j+0] ? -1 : 1;
                    const int w_hi = sgn & kmask_iq2xs[j+4] ? -1 : 1;
                    sumi += grid_lo[j] * q8[j+0] * w_lo;
                    sumi += grid_hi[j] * q8[j+4] * w_hi;
                }
                q8 += 8;
            }
            q3   += 8;
            bsum += sumi * ls;
        }
        total += d * bsum;
    }

    *s = 0.25f * total;

#endif
}
|
|
|
|
// Dot product of a row of block_iq3_s blocks (vx) with a row of block_q8_K blocks (vy).
//   n   - number of values in the row; must be a multiple of QK_K
//   s   - receives the single float result
//   nrc - must be 1; bs, bx and by are not used
// Grid indices are 9 bits wide: the low 8 bits come from qs, bit 8 from the matching bit of
// the qh byte. Signs are stored as one bit per value. Each scales byte holds two 4-bit scales
// (used as 2*ls + 1), one per group of 32 values. No extra factor is applied to the result.
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__AVX2__)

    // shuffle control: output byte i reads byte i/8 of the broadcast 32-bit sign word
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // output byte i tests bit i%8 of the sign byte it received
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);

    // lane k shifts the qh byte left by 8-k so that its bit k lands on bit 8 (value 256)
    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    const __m256i idx_mask  = _mm256_set1_epi32(256);

    // lets the vector-computed indices be read back as scalars for the table lookups
    typedef union {
        __m256i  vec[2];
        uint32_t index[16];
    } index_t;

    index_t idx;

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));

            // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
            const __m256i q2_1 = _mm256_set_epi32(
                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
            );
            const __m256i q2_2 = _mm256_set_epi32(
                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
            );

            // widen to uint32_t before shifting: a uint16_t promotes to signed int, and
            // shifting a value with bit 15 set left by 16 would overflow it (undefined behaviour)
            __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            // s2_* is 0xFF in bytes whose sign bit is set; (v ^ m) - m negates exactly those bytes
            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);

            aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);

            signs += 4;

            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
            // low nibble scales the first group of 32 values, high nibble the second
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
            sumi1 = _mm256_add_epi32(sumi1, p1);
            sumi2 = _mm256_add_epi32(sumi2, p2);
        }

        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);

    }

    *s = hsum_float_8(accumf);

#elif defined(__AVX__)
    // shuffle control: output byte i reads byte i/8 of the broadcast 32-bit sign word
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // output byte i tests bit i%8 of the sign byte it received
    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);

    // multiplying by a power of two stands in for the per-lane shift used in the AVX2 path:
    // lane k of idx_mul_0 moves bit k of qh to bit 8, lane k of idx_mul_1 moves bit k+4 there
    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
    const __m128i idx_mask  = _mm_set1_epi32(256);

    // lets the vector-computed indices be read back as scalars for the table lookups
    typedef union {
        __m128i  vec[4];
        uint32_t index[16];
    } index_t;

    index_t idx;

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t  * GGML_RESTRICT qs = x[i].qs;
        const uint8_t  * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;
        __m128i sumi1_0 = _mm_setzero_si128();
        __m128i sumi1_1 = _mm_setzero_si128();
        __m128i sumi2_0 = _mm_setzero_si128();
        __m128i sumi2_1 = _mm_setzero_si128();
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
            idx.vec[1] = idx.vec[0];
            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
            idx.vec[3] = idx.vec[2];

            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);

            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));

            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);

            // widen to uint32_t before shifting: a uint16_t promotes to signed int, and
            // shifting a value with bit 15 set left by 16 would overflow it (undefined behaviour)
            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
            __m128i aux128_1 = aux128_0;
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
            // s2_* is 0xFF in bytes whose sign bit is set; (v ^ m) - m negates exactly those bytes
            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);

            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
            aux128_1 = aux128_0;
            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);

            signs += 4;

            const __m128i dot1_0  = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
            const __m128i dot1_1  = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
            const __m128i dot2_0  = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
            const __m128i dot2_1  = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
            // low nibble scales the first group of 32 values, high nibble the second
            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
        }

        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);

    }

    *s = hsum_float_8(accumf);

#else

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        // two groups of 32 values per iteration, sharing one scales byte
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                // bit 2*l (resp. 2*l+1) of the qh byte becomes bit 8 of the grid index
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}
|
|
|
|
#if defined(__AVX2__)
// Multiplies the signed bytes of x and y element-wise and adds each adjacent pair of
// products into one signed 16-bit lane (with the signed saturation of maddubs).
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
    // maddubs treats its first operand as unsigned, so feed it |x| and move the
    // sign of x onto y (bytes of y are zeroed where x is zero)
    const __m256i x_abs    = _mm256_sign_epi8(x, x);
    const __m256i y_signed = _mm256_sign_epi8(y, x);
    return _mm256_maddubs_epi16(x_abs, y_signed);
}
#endif
|
|
|
|
// Dot product of one IQ1_S-quantized row with one Q8_K-quantized row.
//
//   n   - number of elements per row; must be a multiple of QK_K
//   s   - receives the single float result
//   vx  - n/QK_K blocks of type block_iq1_s
//   vy  - n/QK_K blocks of type block_q8_K
//   nrc - must be 1; bs, bx and by are unused
//
// For every 32-element sub-block ib, the 16-bit word qh[ib] carries:
//   bits  0..11 : four 3-bit high parts of the grid indices (low 8 bits come from qs)
//   bits 12..14 : a 3-bit scale v, used as the odd integer 2*v + 1
//   bit  15     : sign of the IQ1S_DELTA offset (set -> negative)
// Each grid index selects one 64-bit iq1s_grid entry, read as eight int8 values.
// The offset term is computed from y[i].bsums (two entries per sub-block;
// presumably each entry is the sum of 16 consecutive q8 values - confirm against
// the Q8_K quantizer).
//
// All three code paths accumulate
//   sum_i  d_x[i] * d_y[i] * ( sum_ib ls * <grid, q8>  +  IQ1S_DELTA * sum_ib ls * delta * bsum )
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined __AVX2__

    __m256 accum = _mm256_setzero_ps();   // grid * q8 part, eight partial sums
    float accum1 = 0;                     // delta part, scaled by IQ1S_DELTA at the end
    for (int i = 0; i < nb; ++i) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        __m256i sumi = _mm256_setzero_si256();
        int sumi1 = 0;
        // Two 32-element sub-blocks per iteration.
        for (int ib = 0; ib < QK_K/32; ib += 2) {
#ifdef __BMI2__
            // Load the index bytes with memcpy: qs is a byte pointer, so a direct
            // uint32_t dereference would violate strict aliasing and may be misaligned.
            uint32_t qs_lo;
            uint32_t qs_hi;
            memcpy(&qs_lo, qs,     sizeof(qs_lo));
            memcpy(&qs_hi, qs + 4, sizeof(qs_hi));
            // Build four 11-bit grid indices, one per 16-bit lane:
            // the low byte comes from qs, bits 8..10 from three bits of qh.
            const uint64_t packed_idx1 = _pdep_u64(qs_lo, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
            const uint64_t packed_idx2 = _pdep_u64(qs_hi, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
            // Extract the 16-bit lanes with shifts instead of aliasing the uint64_t.
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[(packed_idx1 >> 48) & 0xffff], iq1s_grid[(packed_idx1 >> 32) & 0xffff],
                                                    iq1s_grid[(packed_idx1 >> 16) & 0xffff], iq1s_grid[(packed_idx1 >>  0) & 0xffff]);
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[(packed_idx2 >> 48) & 0xffff], iq1s_grid[(packed_idx2 >> 32) & 0xffff],
                                                    iq1s_grid[(packed_idx2 >> 16) & 0xffff], iq1s_grid[(packed_idx2 >>  0) & 0xffff]);
#else
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
                                                    iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
                                                    iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
#endif
            qs += 8;
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
            // 3-bit sub-block scales mapped to odd integers 1, 3, ..., 15.
            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));

            sumi   = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
            // Delta contribution: sign from bit 15 of qh, magnitude from the q8 block sums.
            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
        }

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
        accum1 += d * sumi1;

    }

    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

#elif defined __AVX__
    __m256 accum = _mm256_setzero_ps();
    float accum1 = 0;
    for (int i = 0; i < nb; ++i) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        // Without 256-bit integer ops, the eight partial sums live in two 128-bit halves.
        __m128i sumi1_0 = _mm_setzero_si128();
        __m128i sumi1_1 = _mm_setzero_si128();
        int sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
            qs += 8;
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;

            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));

            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
        }

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
        accum1 += d * sumi1;

    }

    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

#else

    // Scalar reference path.
    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t   * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            // Four groups of eight values; group l uses bits 3*l..3*l+2 of qh as index bits 8..10.
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi  += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}
|
|
|
|
// Dot product of one IQ1_M-quantized row with one Q8_K-quantized row.
//
//   n   - number of elements per row; must be a multiple of QK_K
//   s   - receives the single float result
//   vx  - n/QK_K blocks of type block_iq1_m
//   vy  - n/QK_K blocks of type block_q8_K
//   nrc - must be 1; bs, bx and by are unused
//
// Layout as read by this function:
//   - x[i].scales is consumed as 16-bit words sc[k]. The low 12 bits of each word
//     hold four 3-bit scales (used as 2*v + 1), one per group of 16 values. The
//     top 4 bits of sc[0..3] are concatenated into the 16-bit pattern of the
//     per-block fp16 scale.
//   - each qh byte serves two groups of 8 values: bits 0..2 and 4..6 are the high
//     parts of the grid indices (low 8 bits come from qs), bits 3 and 7 are the
//     sign of the IQ1M_DELTA offset for the respective group.
//
// Result: sum_i fp16(scale_i) * d_y[i] * (sum ls * <grid, q8> + IQ1M_DELTA * sum ls * delta * sum(q8)).
//
// The packed fields are loaded with memcpy rather than through casted pointers:
// x[i].scales, qs and qh are accessed as bytes elsewhere, so wider dereferences
// would violate strict aliasing and could be misaligned.
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_m * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    iq1m_scale_t scale;

#if defined __AVX2__

    const __m256i mask  = _mm256_set1_epi16(0x7);
    const __m256i mone  = _mm256_set1_epi16(1);
    const __m256i mone8 = _mm256_set1_epi8(1);
    const __m256i mtwo8 = _mm256_set1_epi8(2);
    // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
    const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);

    __m256 accum1 = _mm256_setzero_ps();   // grid * q8 part
    __m256 accum2 = _mm256_setzero_ps();   // delta part, scaled by IQ1M_DELTA at the end
    for (int i = 0; i < nb; ++i) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;

        // Typed local copy of the packed scale words (QK_K/64 of them).
        uint16_t sc[QK_K/64];
        memcpy(sc, x[i].scales, sizeof(sc));

        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
        // Extract 3-bit scales (16 values)
        uint64_t sc_bits;
        memcpy(&sc_bits, x[i].scales, sizeof(sc_bits));
        __m256i scales = _mm256_set1_epi64x((long long)sc_bits);
        scales = _mm256_srlv_epi64(scales, scales_shift);
        scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);

        // Indices to repeat each scale 8 times.
        __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
        __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));

        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        // Two 32-element sub-blocks per iteration.
        for (int ib = 0; ib < QK_K/32; ib += 2) {
#ifdef __BMI2__
            uint32_t qs_lo;
            uint32_t qs_hi;
            uint32_t qh_bits;
            memcpy(&qs_lo,   qs,     sizeof(qs_lo));
            memcpy(&qs_hi,   qs + 4, sizeof(qs_hi));
            memcpy(&qh_bits, qh,     sizeof(qh_bits));
            // On this little-endian target the low 16 bits of qh_bits are qh[0..1]
            // and the high 16 bits are qh[2..3].
            const uint64_t packed_idx1 = _pdep_u64(qs_lo, 0x00ff00ff00ff00ffULL)
                                       | _pdep_u64(qh_bits & 0x7777, 0xf000f000f000f00ULL);
            const uint64_t packed_idx2 = _pdep_u64(qs_hi, 0x00ff00ff00ff00ffULL)
                                       | _pdep_u64((qh_bits >> 16) & 0x7777, 0xf000f000f000f00ULL);
            // Extract the 16-bit lanes with shifts instead of aliasing the uint64_t.
            const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[(packed_idx1 >> 48) & 0xffff], iq1s_grid[(packed_idx1 >> 32) & 0xffff],
                                                    iq1s_grid[(packed_idx1 >> 16) & 0xffff], iq1s_grid[(packed_idx1 >>  0) & 0xffff]);
            const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[(packed_idx2 >> 48) & 0xffff], iq1s_grid[(packed_idx2 >> 32) & 0xffff],
                                                    iq1s_grid[(packed_idx2 >> 16) & 0xffff], iq1s_grid[(packed_idx2 >>  0) & 0xffff]);

            // Move each nibble's sign bit to bit 7 of its own byte, sign-extend every
            // byte to 64 bits and OR in 1: each group of 8 bytes becomes all-negative
            // or all 0x01, which is all _mm256_sign_epi8 needs.
            const uint64_t delta_sign = _pdep_u64(qh_bits & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
            const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32((int)delta_sign)));
            const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32((int)(delta_sign >> 32))));
#else
            const __m256i q1b_1 = _mm256_set_epi64x(
                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
            );
            const __m256i q1b_2 = _mm256_set_epi64x(
                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
            );

            const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
            const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                     qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
#endif
            const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
            const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;

            const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
            const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
            // Signed pairwise sums of q8, i.e. delta * q8 with delta = +/-1.
            const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
            const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));

            __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
            __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);

            // Advance the shuffle indices to the next 16-bit scale word.
            scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
            scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);

            const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
            const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
            const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
            const __m256i p4 = _mm256_madd_epi16(dot4, scale2);

            sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
            sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));

            qs += 8; qh += 4;
        }

        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));

        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
    }

    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);

#elif defined __AVX__
    const __m128i mask = _mm_set1_epi16(0x7);
    const __m128i mone = _mm_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;

        // Typed local copy of the packed scale words (QK_K/64 of them).
        uint16_t sc[QK_K/64];
        memcpy(sc, x[i].scales, sizeof(sc));

        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        __m128i sumi1_0 = _mm_setzero_si128();
        __m128i sumi1_1 = _mm_setzero_si128();
        __m128i sumi2_0 = _mm_setzero_si128();
        __m128i sumi2_1 = _mm_setzero_si128();
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i q1b_1_0 = _mm_set_epi64x(
                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
            const __m128i q1b_1_1 = _mm_set_epi64x(
                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
            const __m128i q1b_2_0 = _mm_set_epi64x(
                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
            const __m128i q1b_2_1 = _mm_set_epi64x(
                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;

            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);

            // Per-group delta vectors: all bytes -1 when the sign bit is set, else +1.
            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                    qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                    qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                    qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
                                                    qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);

            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);

            // Four 3-bit scales of this pair of sub-blocks, at bit offsets 0, 3, 6 and 9.
            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);

            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);

            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));

            qs += 8; qh += 4;
        }

        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));

        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
    }

    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);

#else

    // Scalar reference path.
    int sum1[2], sum2[2], delta[4];

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;

        // Typed local copy of the packed scale words (QK_K/64 of them).
        uint16_t sc[QK_K/64];
        memcpy(sc, x[i].scales, sizeof(sc));

        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            // One delta sign per group of 8 values.
            delta[0] = qh[0] & 0x08 ? -1 : 1;
            delta[1] = qh[0] & 0x80 ? -1 : 1;
            delta[2] = qh[1] & 0x08 ? -1 : 1;
            delta[3] = qh[1] & 0x80 ? -1 : 1;
            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
            for (int l = 0; l < 4; ++l) {
                // Even l takes index bits 8..10 from the low nibble of qh[l/2], odd l from the high nibble.
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
                int lsum1 = 0, lsum2 = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum1 += q8[j] * grid[j];
                    lsum2 += q8[j];
                }
                q8 += 8;
                sum1[l/2] += lsum1;
                sum2[l/2] += lsum2*delta[l];
            }

            // Separate scales for the first and second 16 values of the sub-block.
            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;

            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
            qs += 4;
            qh += 2;
        }

        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;

#endif
}
|
|
|
|
// Dot product of one IQ4_NL-quantized row with one Q8_0-quantized row.
//
//   n   - number of elements per row; must be a multiple of QK4_NL
//   s   - receives the single float result
//   vx  - n/QK4_NL blocks of type block_iq4_nl
//   vy  - n/QK4_NL blocks of type block_q8_0
//   nrc - must be 1; bs, bx and by are unused
//
// Each byte of x[].qs holds two 4-bit indices into kvalues_iq4nl: the low nibble
// belongs to element j, the high nibble to element j + QK4_NL/2. The SIMD paths
// process blocks in pairs; the scalar loop at the end handles whatever is left
// (all blocks when no SIMD path is compiled in).
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    int ib = 0;
    float sumf = 0;

#if defined __AVX2__

    const __m128i codebook    = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);
    const __m256i ones16      = _mm256_set1_epi16(1);

    // One accumulator per block of the pair.
    __m256 acc_first  = _mm256_setzero_ps();
    __m256 acc_second = _mm256_setzero_ps();
    while (ib + 1 < nb) {
        const __m128i packed_a = _mm_loadu_si128((const __m128i*)x[ib + 0].qs);
        const __m128i packed_b = _mm_loadu_si128((const __m128i*)x[ib + 1].qs);
        const __m256i act_a    = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs);
        const __m256i act_b    = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs);

        // Codebook lookup: low nibbles fill the lower 128 bits, high nibbles the upper.
        const __m128i lo_a = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_mask));
        const __m128i hi_a = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_mask));
        const __m128i lo_b = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_mask));
        const __m128i hi_b = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_mask));
        const __m256i weights_a = MM256_SET_M128I(hi_a, lo_a);
        const __m256i weights_b = MM256_SET_M128I(hi_b, lo_b);

        // int8 products -> 16-bit pair sums -> 32-bit sums.
        const __m256i dot_a = _mm256_madd_epi16(mul_add_epi8(weights_a, act_a), ones16);
        const __m256i dot_b = _mm256_madd_epi16(mul_add_epi8(weights_b, act_b), ones16);

        const float scale_a = GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d);
        const float scale_b = GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d);
        acc_first  = _mm256_fmadd_ps(_mm256_set1_ps(scale_a), _mm256_cvtepi32_ps(dot_a), acc_first);
        acc_second = _mm256_fmadd_ps(_mm256_set1_ps(scale_b), _mm256_cvtepi32_ps(dot_b), acc_second);

        ib += 2;
    }

    sumf = hsum_float_8(_mm256_add_ps(acc_first, acc_second));

#elif defined __AVX__
    const __m128i codebook    = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);

    __m256 acc = _mm256_setzero_ps();
    while (ib + 1 < nb) {
        const __m128i packed_a = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
        const __m128i packed_b = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
        // Each q8 block is read as two consecutive 16-byte halves.
        const __m128i act_a_lo = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
        const __m128i act_a_hi = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
        const __m128i act_b_lo = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
        const __m128i act_b_hi = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);

        const __m128i w_a_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_mask));
        const __m128i w_a_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_mask));
        const __m128i w_b_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_mask));
        const __m128i w_b_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_mask));

        const __m256 dots   = mul_sum_i8_quad_float(w_a_lo, w_a_hi, w_b_lo, w_b_hi, act_a_lo, act_a_hi, act_b_lo, act_b_hi);
        const __m256 scales = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
        acc = _mm256_add_ps(_mm256_mul_ps(scales, dots), acc);

        ib += 2;
    }

    sumf = hsum_float_8(acc);

#endif
    // Scalar path for the blocks not consumed above.
    while (ib < nb) {
        const float block_scale = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
        int lo_sum = 0;
        int hi_sum = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            lo_sum += y[ib].qs[j+       0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            hi_sum += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >>  4];
        }
        sumf += block_scale * (lo_sum + hi_sum);
        ++ib;
    }
    *s = sumf;
}
|
|
|
|
// Dot product of one IQ4_XS-quantized row with one Q8_K-quantized row.
//
//   n   - number of elements per row; must be a multiple of QK_K
//   s   - receives the single float result
//   vx  - n/QK_K blocks of type block_iq4_xs
//   vy  - n/QK_K blocks of type block_q8_K
//   nrc - must be 1; bs, bx and by are unused
//
// Every 32-element sub-block has a 6-bit scale: the low 4 bits come from a nibble
// of scales_l, the top 2 bits from scales_h (two bits per sub-block, consumed from
// the least significant end), and 32 is subtracted from the result. Within a
// sub-block, 16 bytes of qs hold 32 indices into kvalues_iq4nl: low nibbles map to
// the first 16 q8 values, high nibbles to the next 16.
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined __AVX2__

    const __m128i codebook    = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * q4 = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        uint16_t high_bits = x[ibl].scales_h;
        __m256i isum_a = _mm256_setzero_si256();
        __m256i isum_b = _mm256_setzero_si256();
        // Two sub-blocks (a, b) per iteration.
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i packed_a = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
            const __m128i packed_b = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
            const __m256i act_a    = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i act_b    = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;

            const __m128i lo_a = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_mask));
            const __m128i hi_a = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_mask));
            const __m128i lo_b = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_mask));
            const __m128i hi_b = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_mask));
            const __m256i weights_a = MM256_SET_M128I(hi_a, lo_a);
            const __m256i weights_b = MM256_SET_M128I(hi_b, lo_b);

            const __m256i pairs_a = mul_add_epi8(weights_a, act_a);
            const __m256i pairs_b = mul_add_epi8(weights_b, act_b);

            const int16_t ls_a = ((x[ibl].scales_l[ib/2] & 0xf) | ((high_bits << 4) & 0x30)) - 32;
            const int16_t ls_b = ((x[ibl].scales_l[ib/2] >>  4) | ((high_bits << 2) & 0x30)) - 32;
            high_bits >>= 4;

            const __m256i scaled_a = _mm256_madd_epi16(pairs_a, _mm256_set1_epi16(ls_a));
            const __m256i scaled_b = _mm256_madd_epi16(pairs_b, _mm256_set1_epi16(ls_b));
            isum_a = _mm256_add_epi32(scaled_a, isum_a);
            isum_b = _mm256_add_epi32(scaled_b, isum_b);
        }
        const __m256  block_scale = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d);
        const __m256i block_isum  = _mm256_add_epi32(isum_a, isum_b);
        acc = _mm256_fmadd_ps(block_scale, _mm256_cvtepi32_ps(block_isum), acc);
    }

    *s = hsum_float_8(acc);

#elif defined __AVX__
    const __m128i codebook    = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i nibble_mask = _mm_set1_epi8(0x0f);

    __m256 acc = _mm256_setzero_ps();
    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * q4 = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        uint16_t high_bits = x[ibl].scales_h;
        // Integer sums are kept in 128-bit halves (lo = low nibbles, hi = high nibbles).
        __m128i isum_a_lo = _mm_setzero_si128();
        __m128i isum_a_hi = _mm_setzero_si128();
        __m128i isum_b_lo = _mm_setzero_si128();
        __m128i isum_b_hi = _mm_setzero_si128();
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const __m128i packed_a = _mm_loadu_si128((const __m128i *)q4); q4 += 16;
            const __m128i packed_b = _mm_loadu_si128((const __m128i *)q4); q4 += 16;
            const __m128i act_a_lo = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i act_a_hi = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i act_b_lo = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
            const __m128i act_b_hi = _mm_loadu_si128((const __m128i *)q8); q8 += 16;

            const __m128i w_a_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_mask));
            const __m128i w_a_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_mask));
            const __m128i w_b_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_mask));
            const __m128i w_b_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_mask));

            const __m128i pairs_a_lo = mul_add_epi8_sse(w_a_lo, act_a_lo);
            const __m128i pairs_a_hi = mul_add_epi8_sse(w_a_hi, act_a_hi);
            const __m128i pairs_b_lo = mul_add_epi8_sse(w_b_lo, act_b_lo);
            const __m128i pairs_b_hi = mul_add_epi8_sse(w_b_hi, act_b_hi);

            const int16_t ls_a = ((x[ibl].scales_l[ib/2] & 0xf) | ((high_bits << 4) & 0x30)) - 32;
            const int16_t ls_b = ((x[ibl].scales_l[ib/2] >>  4) | ((high_bits << 2) & 0x30)) - 32;
            high_bits >>= 4;

            const __m128i scale_vec_a = _mm_set1_epi16(ls_a);
            const __m128i scale_vec_b = _mm_set1_epi16(ls_b);
            isum_a_lo = _mm_add_epi32(_mm_madd_epi16(pairs_a_lo, scale_vec_a), isum_a_lo);
            isum_a_hi = _mm_add_epi32(_mm_madd_epi16(pairs_a_hi, scale_vec_a), isum_a_hi);
            isum_b_lo = _mm_add_epi32(_mm_madd_epi16(pairs_b_lo, scale_vec_b), isum_b_lo);
            isum_b_hi = _mm_add_epi32(_mm_madd_epi16(pairs_b_hi, scale_vec_b), isum_b_hi);
        }
        const __m128i total_lo = _mm_add_epi32(isum_a_lo, isum_b_lo);
        const __m128i total_hi = _mm_add_epi32(isum_a_hi, isum_b_hi);
        const __m256  block_scale = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d);
        acc = _mm256_add_ps(_mm256_mul_ps(block_scale,
                    _mm256_cvtepi32_ps(MM256_SET_M128I(total_hi, total_lo))), acc);
    }

    *s = hsum_float_8(acc);

#else
    // Scalar reference path.
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t high_bits = x[ibl].scales_h;
        const uint8_t * q4 = x[ibl].qs;
        const int8_t  * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            // Scales of the two sub-blocks handled in this iteration.
            const uint8_t ls[2] = {
                (uint8_t)((x[ibl].scales_l[ib/2] & 0xf) | ((high_bits << 4) & 0x30)),
                (uint8_t)((x[ibl].scales_l[ib/2] >>  4) | ((high_bits << 2) & 0x30)),
            };
            high_bits >>= 4;
            const float sub_scale[2] = { d4d8*(ls[0] - 32), d4d8*(ls[1] - 32) };
            for (int k = 0; k < 2; ++k) {
                int lo_sum = 0;
                int hi_sum = 0;
                for (int j = 0; j < 16; ++j) {
                    lo_sum += q8[j+ 0] * kvalues_iq4nl[q4[j] & 0xf];
                    hi_sum += q8[j+16] * kvalues_iq4nl[q4[j] >>  4];
                }
                sumf += sub_scale[k] * (lo_sum + hi_sum);
                q4 += 16;
                q8 += 32;
            }
        }
    }
    *s = sumf;
#endif
}
|
|
|