; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI

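; VPDPBUSD multiplies each group of 4 unsigned bytes from its first source by
; the corresponding 4 signed bytes from its second source, producing signed
; 16-bit products that are sign-extended and accumulated into the matching i32
; lane. These tests check which extended-multiply reductions fold into it.

; Both inputs are zero-extended here, so neither operand fits vpdpbusd's
; signed-byte slot; the combine must not fire and we expect the generic
; vpmaddwd reduction instead.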
define i32 @no_dpbusd(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: no_dpbusd:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: no_dpbusd:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 16
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 16
  %5 = zext <16 x i8> %4 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

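; Same reduction as vpdpbusd_512 below, but with the extensions swapped: %a is
; sign-extended and %b zero-extended. The combine should still fire, commuting
; the multiply so the unsigned operand lands in vpdpbusd's first source slot.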
define i32 @vpdpbusd_mutate(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_mutate:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rdi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_mutate:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_mutate:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rsi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rdi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 16
  %2 = sext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 16
  %5 = zext <16 x i8> %4 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

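; Here the bytes are only extended to i16, and the i16 products are then
; zero-extended to i32 before the reduction, which does not match what
; vpdpbusd accumulates (see the comment and worked example in the body).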
define i32 @mul_zext(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_zext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXVNNI-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_zext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 16
  %2 = zext <16 x i8> %1 to <16 x i16>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 16
  %5 = sext <16 x i8> %4 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  ; We can't combine to vpdpbusd for zext, because each of the 4 multiplies
  ; done by vpdpbusd computes a signed 16-bit product that will be sign
  ; extended before adding into the accumulator.
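  ; For example, a byte of 1 (unsigned) times a byte of -1 (signed) gives an
  ; i16 product of -1 (0xFFFF): vpdpbusd would accumulate -1, but this IR
  ; zero-extends 0xFFFF and adds 65535.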
  %7 = zext <16 x i16> %6 to <16 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %op.extra = add nsw i32 %8, %c
  ret i32 %op.extra
}

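; As above, but the i16 products are sign-extended, which does match
; vpdpbusd's accumulation; the fold is not performed yet (see the TODO below).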
define i32 @mul_sext(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: mul_sext:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVXVNNI-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVXVNNI-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVXVNNI-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVXVNNI-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: mul_sext:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT:    vpmovsxbw (%rsi), %ymm1
; AVX512-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 16
  %2 = zext <16 x i8> %1 to <16 x i16>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 16
  %5 = sext <16 x i8> %4 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  ; TODO:
  ; We also need to verify that the multiply has at least 2x the number of bits
  ; of the input. We shouldn't match
  ; (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
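  ; For example, in i9 arithmetic X = Y = 255 gives a wrapped product of 1
  ; (65025 mod 512), so sign-extending the narrow multiply is not the same as
  ; multiplying the widened inputs.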
  %7 = sext <16 x i16> %6 to <16 x i32>
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %op.extra = add nsw i32 %8, %c
  ret i32 %op.extra
}

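; The canonical match: 16 zero-extended bytes times 16 sign-extended bytes,
; reduced to i32, folds to a single 128-bit vpdpbusd (widened to zmm when
; AVX512VL is unavailable).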
define i32 @vpdpbusd_512(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_512:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %xmm0, %xmm1
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_512:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VNNI-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_512:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %xmm0, %xmm1
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 16
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = bitcast i8* %b to <16 x i8>*
  %4 = load <16 x i8>, <16 x i8>* %3, align 16
  %5 = sext <16 x i8> %4 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

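; 8-byte inputs: loaded as qwords, one xmm vpdpbusd, then the two live dword
; accumulators are summed.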
define i32 @vpdpbusd_256(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_256:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm1, %xmm2
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_256:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_256:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 8
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %b to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 8
  %5 = sext <8 x i8> %4 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

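; 4-byte inputs: the upper lanes are zeroed with vpblendw so only the low
; dword accumulates, and the result is read back directly with vmovd.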
define i32 @vpdpbusd_128(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_128:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_128:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_128:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <4 x i8>*
  %1 = load <4 x i8>, <4 x i8>* %0, align 8
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = bitcast i8* %b to <4 x i8>*
  %4 = load <4 x i8>, <4 x i8>* %3, align 8
  %5 = sext <4 x i8> %4 to <4 x i32>
  %6 = mul nsw <4 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

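; 2-byte inputs: only the low two bytes are kept (a word blend, or an
; [65535,0,0,0] mask without AVX512VL) before the vpdpbusd.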
define i32 @vpdpbusd_2xi32(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_2xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
; AVXVNNI-NEXT:    vmovd %xmm2, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VNNI-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT:    vpandq %zmm1, %zmm2, %zmm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_2xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <2 x i8>*
  %1 = load <2 x i8>, <2 x i8>* %0, align 8
  %2 = zext <2 x i8> %1 to <2 x i32>
  %3 = bitcast i8* %b to <2 x i8>*
  %4 = load <2 x i8>, <2 x i8>* %3, align 8
  %5 = sext <2 x i8> %4 to <2 x i32>
  %6 = mul nsw <2 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)

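; 32-byte inputs: a single ymm vpdpbusd (widened to zmm without AVX512VL),
; followed by the usual horizontal reduction.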
define i32 @vpdpbusd_32xi32(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_32xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm1
; AVXVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVXVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512VNNI-LABEL: vpdpbusd_32xi32:
; AVX512VNNI:       # %bb.0: # %entry
; AVX512VNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VNNI-NEXT:    vmovdqu (%rsi), %ymm1
; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT:    vpdpbusd %zmm1, %zmm0, %zmm2
; AVX512VNNI-NEXT:    vextracti128 $1, %ymm2, %xmm0
; AVX512VNNI-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VNNI-NEXT:    addl %edx, %eax
; AVX512VNNI-NEXT:    vzeroupper
; AVX512VNNI-NEXT:    retq
;
; AVX512VLVNNI-LABEL: vpdpbusd_32xi32:
; AVX512VLVNNI:       # %bb.0: # %entry
; AVX512VLVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT:    vpdpbusd (%rsi), %ymm0, %ymm1
; AVX512VLVNNI-NEXT:    vextracti128 $1, %ymm1, %xmm0
; AVX512VLVNNI-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT:    vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT:    addl %edx, %eax
; AVX512VLVNNI-NEXT:    vzeroupper
; AVX512VLVNNI-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <32 x i8>*
  %1 = load <32 x i8>, <32 x i8>* %0, align 16
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = bitcast i8* %b to <32 x i8>*
  %4 = load <32 x i8>, <32 x i8>* %3, align 16
  %5 = sext <32 x i8> %4 to <32 x i32>
  %6 = mul nsw <32 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

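; 64-byte inputs: two ymm vpdpbusd ops on AVXVNNI, or a single zmm vpdpbusd
; with AVX512VNNI.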
define i32 @vpdpbusd_64xi32(i8 *%a, i8 *%b, i32 %c, i32 %n) {
; AVXVNNI-LABEL: vpdpbusd_64xi32:
; AVXVNNI:       # %bb.0: # %entry
; AVXVNNI-NEXT:    vmovdqu (%rdi), %ymm0
; AVXVNNI-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVXVNNI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVXVNNI-NEXT:    {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3
; AVXVNNI-NEXT:    {vex} vpdpbusd (%rsi), %ymm0, %ymm2
; AVXVNNI-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
; AVXVNNI-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT:    vmovd %xmm0, %eax
; AVXVNNI-NEXT:    addl %edx, %eax
; AVXVNNI-NEXT:    vzeroupper
; AVXVNNI-NEXT:    retq
;
; AVX512-LABEL: vpdpbusd_64xi32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpdpbusd (%rsi), %zmm0, %zmm1
; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    addl %edx, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = bitcast i8* %a to <64 x i8>*
  %1 = load <64 x i8>, <64 x i8>* %0, align 16
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = bitcast i8* %b to <64 x i8>*
  %4 = load <64 x i8>, <64 x i8>* %3, align 16
  %5 = sext <64 x i8> %4 to <64 x i32>
  %6 = mul nsw <64 x i32> %5, %2
  %7 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %6)
  %op.extra = add nsw i32 %7, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)