diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index df9be9a35e2e..6e955d620f5c 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
 
 ; Check that under certain conditions we can factor out a rotate
 ; from the following idioms:
@@ -13,9 +13,7 @@ define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
 ; CHECK-LABEL: vroll_v4i32_extract_shl:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
-; CHECK-NEXT: vprold $7, %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vprold $7, %xmm0, %xmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
   %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
@@ -28,8 +26,7 @@ define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
 ; CHECK-LABEL: vrolq_v4i64_extract_shrl:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0
-; CHECK-NEXT: vprolq $29, %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: vprolq $29, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %lhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
   %rhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
@@ -39,13 +36,17 @@ define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
 }
 
 define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
-; CHECK-LABEL: vroll_extract_mul:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
-; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vprold $6, %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: vroll_extract_mul:
+; X86: # %bb.0:
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; X86-NEXT: vprold $6, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: vroll_extract_mul:
+; X64: # %bb.0:
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-NEXT: vprold $6, %ymm0, %ymm0
+; X64-NEXT: retq
   %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
   %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
   %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
@@ -75,10 +76,8 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT: vprolq $57, %zmm0, %zmm0
-; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; X86-NEXT: vprolq $57, %xmm0, %xmm0
 ; X86-NEXT: addl $32, %esp
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: vrolq_extract_udiv:
@@ -92,9 +91,7 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
 ; X64-NEXT: vmovq %rdx, %xmm0
 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
-; X64-NEXT: vprolq $57, %zmm0, %zmm0
-; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; X64-NEXT: vzeroupper
+; X64-NEXT: vprolq $57, %xmm0, %xmm0
 ; X64-NEXT: retq
   %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
   %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
@@ -106,20 +103,16 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
 define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
 ; X86-LABEL: vrolw_extract_mul_with_mask:
 ; X86: # %bb.0:
-; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
-; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X86-NEXT: vprold $7, %zmm0, %zmm0
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
+; X86-NEXT: vprold $7, %xmm0, %xmm0
 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: vrolw_extract_mul_with_mask:
 ; X64: # %bb.0:
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
-; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-NEXT: vprold $7, %zmm0, %zmm0
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; X64-NEXT: vprold $7, %xmm0, %xmm0
 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
   %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
   %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
@@ -170,14 +163,19 @@ define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
 
 ; Result would overshift
 define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
-; CHECK-LABEL: no_extract_shrl:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $25, %xmm0, %xmm1
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4026531840,4026531840,4026531840,4026531840]
-; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpsrld $9, %xmm0, %xmm0
-; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: no_extract_shrl:
+; X86: # %bb.0:
+; X86-NEXT: vpsrld $9, %xmm0, %xmm1
+; X86-NEXT: vpslld $25, %xmm0, %xmm0
+; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: no_extract_shrl:
+; X64: # %bb.0:
+; X64-NEXT: vpsrld $9, %xmm0, %xmm1
+; X64-NEXT: vpslld $25, %xmm0, %xmm0
+; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
+; X64-NEXT: retq
   %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
   %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
   %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
@@ -187,15 +185,21 @@ define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
 
 ; Can factor 512 from 1536, but result is 3 instead of 9
 define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
-; CHECK-LABEL: no_extract_mul:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
-; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
-; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $23, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: no_extract_mul:
+; X86: # %bb.0:
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
+; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; X86-NEXT: vpsrld $23, %ymm0, %ymm0
+; X86-NEXT: vpor %ymm0, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: no_extract_mul:
+; X64: # %bb.0:
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
+; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-NEXT: vpsrld $23, %ymm0, %ymm0
+; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
   %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
   %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
   %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -243,8 +247,7 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
 ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: vpsllq $56, %xmm1, %xmm1
+; X86-NEXT: vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X86-NEXT: addl $48, %esp
 ; X86-NEXT: retl
@@ -285,10 +288,7 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
 define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
 ; CHECK-LABEL: extract_add_1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vprold $1, %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vprold $1, %xmm0, %xmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %ii = add <4 x i32> %i, %i
   %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
@@ -299,10 +299,7 @@ define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
 define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
 ; CHECK-LABEL: extract_add_1_comut:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vprold $1, %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vprold $1, %xmm0, %xmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %ii = add <4 x i32> %i, %i
   %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>