From 890b415902b1987b51a6b1e26fedd8a9a8d136ff Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 9 Jul 2016 20:55:20 +0000
Subject: [PATCH] [X86][SSE] Regenerate vector shift tests

llvm-svn: 274987
---
 llvm/test/CodeGen/X86/vec_shift.ll  |  41 ++++-
 llvm/test/CodeGen/X86/vec_shift2.ll |  34 ++++-
 llvm/test/CodeGen/X86/vec_shift3.ll |  41 ++++-
 llvm/test/CodeGen/X86/vec_shift4.ll |  64 +++++++-
 llvm/test/CodeGen/X86/vec_shift5.ll | 217 +++++++++++++++++++--------
 llvm/test/CodeGen/X86/vec_shift6.ll | 225 ++++++++++++++++++++--------
 llvm/test/CodeGen/X86/vec_shift7.ll |  23 ++-
 7 files changed, 496 insertions(+), 149 deletions(-)

diff --git a/llvm/test/CodeGen/X86/vec_shift.ll b/llvm/test/CodeGen/X86/vec_shift.ll
index ddf0469b72a7..55b55936634d 100644
--- a/llvm/test/CodeGen/X86/vec_shift.ll
+++ b/llvm/test/CodeGen/X86/vec_shift.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psrlq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

 define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllw %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp6 = bitcast <2 x i64> %c to <8 x i16> ; <<8 x i16>> [#uses=1]
 %tmp8 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
@@ -12,6 +21,17 @@ entry:
 }

 define <2 x i64> @t3(<2 x i64> %b1, i32 %c) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp2 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
 %tmp4 = insertelement <4 x i32> undef, i32 %c, i32 0 ; <<4 x i32>> [#uses=1]
@@ -21,14 +41,23 @@ entry:
 ret <2 x i64> %tmp11
 }

-declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone

 define <2 x i64> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psrlq %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp9 = tail call <2 x i64> @llvm.x86.sse2.psrl.q( <2 x i64> %b1, <2 x i64> %c ) nounwind readnone ; <<2 x i64>> [#uses=1]
 ret <2 x i64> %tmp9
 }

-declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone

-declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_shift2.ll b/llvm/test/CodeGen/X86/vec_shift2.ll
index c5f9dc4ace32..21d599fead08 100644
--- a/llvm/test/CodeGen/X86/vec_shift2.ll
+++ b/llvm/test/CodeGen/X86/vec_shift2.ll
@@ -1,6 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep CPI
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

 define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: psrlw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: psrlw %xmm1, %xmm0
+; X64-NEXT: retq
 %tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
 %tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
 %tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
@@ -8,10 +23,23 @@ define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
 }

 define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
+; X64-NEXT: retq
 %tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
 %tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
 ret <4 x i32> %tmp2
 }

-declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_shift3.ll b/llvm/test/CodeGen/X86/vec_shift3.ll
index 1ebf455c0555..071f0d38b96d 100644
--- a/llvm/test/CodeGen/X86/vec_shift3.ll
+++ b/llvm/test/CodeGen/X86/vec_shift3.ll
@@ -1,20 +1,51 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movd | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

 define <2 x i64> @t1(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psllq %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 %bits ) nounwind readnone ; <<2 x i64>> [#uses=1]
 ret <2 x i64> %tmp3
 }

 define <2 x i64> @t2(<2 x i64> %x1) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllq $10, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $10, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 10 ) nounwind readnone ; <<2 x i64>> [#uses=1]
 ret <2 x i64> %tmp3
 }

 define <2 x i64> @t3(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 %tmp2 = bitcast <2 x i64> %x1 to <8 x i16> ; <<8 x i16>> [#uses=1]
 %tmp4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w( <8 x i16> %tmp2, i32 %bits ) nounwind readnone ; <<8 x i16>> [#uses=1]
@@ -22,5 +53,5 @@ entry:
 ret <2 x i64> %tmp5
 }

-declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
-declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_shift4.ll b/llvm/test/CodeGen/X86/vec_shift4.ll
index b266a6987557..66229361990f 100644
--- a/llvm/test/CodeGen/X86/vec_shift4.ll
+++ b/llvm/test/CodeGen/X86/vec_shift4.ll
@@ -1,6 +1,23 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64

 define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
+; X32-LABEL: shl1:
+; X32: # BB#0: # %entry
+; X32-NEXT: pslld $23, %xmm1
+; X32-NEXT: paddd {{\.LCPI.*}}, %xmm1
+; X32-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-NEXT: pmulld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl1:
+; X64: # BB#0: # %entry
+; X64-NEXT: pslld $23, %xmm1
+; X64-NEXT: paddd {{.*}}(%rip), %xmm1
+; X64-NEXT: cvttps2dq %xmm1, %xmm1
+; X64-NEXT: pmulld %xmm1, %xmm0
+; X64-NEXT: retq
 entry:
 ; CHECK-NOT: shll
 ; CHECK: pslld
@@ -14,6 +31,51 @@ entry:
 }

 define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
+; X32-LABEL: shl2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: psllw $5, %xmm1
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $4, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $2, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: paddb %xmm3, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $4, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $2, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: paddb %xmm3, %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
 entry:
 ; CHECK-NOT: shlb
 ; CHECK: pblendvb
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 499aa22de52d..cba2b5d05041 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -1,153 +1,238 @@
-; RUN: llc -march=x86-64 -mattr=+sse2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

 ; Verify that we correctly fold target specific packed vector shifts by
 ; immediate count into a simple build_vector when the elements of the vector
 ; in input to the packed shift are all constants or undef.

 define <8 x i16> @test1() {
+; X32-LABEL: test1:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <8 x i16> @test2() {
+; X32-LABEL: test2:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <8 x i16> @test3() {
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test4() {
+; X32-LABEL: test4:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test5() {
+; X32-LABEL: test5:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test6() {
+; X32-LABEL: test6:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <2 x i64> @test7() {
+; X32-LABEL: test7:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [1,0,2,0]
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16]
+; X64-NEXT: retq
 %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> , i32 3)
 ret <2 x i64> %1
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <2 x i64> @test8() {
+; X32-LABEL: test8:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [8,0,16,0]
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [1,2]
+; X64-NEXT: retq
 %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> , i32 3)
 ret <2 x i64> %1
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <8 x i16> @test9() {
+; X32-LABEL: test9:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test9:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test10() {
+; X32-LABEL: test10:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 =
+; X32-NEXT: retl
+;
+; X64-LABEL: test10:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 =
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <2 x i64> @test11() {
+; X32-LABEL: test11:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 =
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test11:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 =
+; X64-NEXT: retq
 %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> , i32 3)
 ret <2 x i64> %1
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <8 x i16> @test12() {
+; X32-LABEL: test12:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test12:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test13() {
+; X32-LABEL: test13:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 =
+; X32-NEXT: retl
+;
+; X64-LABEL: test13:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 =
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <8 x i16> @test14() {
+; X32-LABEL: test14:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test14:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
 %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> , i32 3)
 ret <8 x i16> %1
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <4 x i32> @test15() {
+; X32-LABEL: test15:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 =
+; X32-NEXT: retl
+;
+; X64-LABEL: test15:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 =
+; X64-NEXT: retq
 %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> , i32 3)
 ret <4 x i32> %1
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret

 define <2 x i64> @test16() {
+; X32-LABEL: test16:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 =
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test16:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 =
+; X64-NEXT: retq
 %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> , i32 3)
 ret <2 x i64> %1
 }
-; CHECK-LABEL: test16
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
-
 declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index b71f9893a9db..c4b7f204be69 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -1,134 +1,229 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512

 ; Verify that we don't scalarize a packed vector shift left of 16-bit
 ; signed integers if the amount is a constant build_vector.
 ; Check that we produce a SSE2 packed integer multiply (pmullw) instead.

 define <8 x i16> @test1(<8 x i16> %a) {
+; SSE-LABEL: test1:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
 %shl = shl <8 x i16> %a,
 ret <8 x i16> %shl
 }

-; CHECK-LABEL: test1
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
 define <8 x i16> @test2(<8 x i16> %a) {
+; SSE-LABEL: test2:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
 %shl = shl <8 x i16> %a,
 ret <8 x i16> %shl
 }

-; CHECK-LABEL: test2
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
 ; Verify that a vector shift left of 32-bit signed integers is simply expanded
 ; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
 ; counts is a constant build_vector.

 define <4 x i32> @test3(<4 x i32> %a) {
+; SSE-LABEL: test3:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
 %shl = shl <4 x i32> %a,
 ret <4 x i32> %shl
 }

-; CHECK-LABEL: test3
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
 define <4 x i32> @test4(<4 x i32> %a) {
+; SSE-LABEL: test4:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
 %shl = shl <4 x i32> %a,
 ret <4 x i32> %shl
 }

-; CHECK-LABEL: test4
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
 ; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
 ; into two pmullw instructions. With AVX2, the test case below would produce
 ; a single vpmullw.

 define <16 x i16> @test5(<16 x i16> %a) {
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
 %shl = shl <16 x i16> %a,
 ret <16 x i16> %shl
 }

-; CHECK-LABEL: test5
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NOT: vpmullw
-; CHECK: ret
-
 ; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
 ; into two pmulld instructions. With AVX2, the test case below would produce
 ; a single vpsllvd instead.

 define <8 x i32> @test6(<8 x i32> %a) {
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm2, %xmm0
+; SSE-NEXT: pmulld %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test6:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
 %shl = shl <8 x i32> %a,
 ret <8 x i32> %shl
 }

-; CHECK-LABEL: test6
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; AVX2: vpsllvd
-; CHECK: ret
-
 ; With AVX2 and AVX512, the test case below should produce a sequence of
 ; two vpmullw instructions. On SSE2 instead, we split the shift in four
 ; parts and then we convert each part into a pmullw.

 define <32 x i16> @test7(<32 x i16> %a) {
+; SSE-LABEL: test7:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm4, %xmm0
+; SSE-NEXT: pmullw %xmm4, %xmm1
+; SSE-NEXT: pmullw %xmm4, %xmm2
+; SSE-NEXT: pmullw %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test7:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: retq
 %shl = shl <32 x i16> %a,
 ret <32 x i16> %shl
 }

-; CHECK-LABEL: test7
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NEXT: vpmullw
-; CHECK: ret
-
 ; Similar to test7; the difference is that with AVX512 support
 ; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.

 define <16 x i32> @test8(<16 x i32> %a) {
+; SSE-LABEL: test8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm4, %xmm0
+; SSE-NEXT: pmulld %xmm4, %xmm1
+; SSE-NEXT: pmulld %xmm4, %xmm2
+; SSE-NEXT: pmulld %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
 %shl = shl <16 x i32> %a,
 ret <16 x i32> %shl
 }

-; CHECK-LABEL: test8
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; AVX2ONLY: vpsllvd
-; AVX2ONLY-NEXT: vpsllvd
-; AVX512: vpsllvd
-; AVX512-NOT: vpsllvd
-; CHECK: ret
-
-; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+; The shift from 'test9' gets shifted separately and blended if we don't have AVX2/AVX512f support.

 define <8 x i64> @test9(<8 x i64> %a) {
+; SSE-LABEL: test9:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: paddq %xmm0, %xmm0
+; SSE-NEXT: paddq %xmm2, %xmm2
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test9:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test9:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
 %shl = shl <8 x i64> %a,
 ret <8 x i64> %shl
 }

-; CHECK-LABEL: test9
-; AVX2ONLY: vpsllvq
-; AVX2ONLY-NEXT: vpsllvq
-; AVX512: vpsllvq
-; AVX512-NOT: vpsllvq
-; CHECK: ret
-
diff --git a/llvm/test/CodeGen/X86/vec_shift7.ll b/llvm/test/CodeGen/X86/vec_shift7.ll
index cdf828976be4..80d72a4a986f 100644
--- a/llvm/test/CodeGen/X86/vec_shift7.ll
+++ b/llvm/test/CodeGen/X86/vec_shift7.ll
@@ -1,12 +1,29 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64

 ; Verify that we don't fail when shift by zero is encountered.

 define i64 @test1(<2 x i64> %a) {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm1
+; X32-NEXT: psllq $2, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-NEXT: movd %xmm1, %eax
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X32-NEXT: movd %xmm0, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psllq $2, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X64-NEXT: movd %xmm1, %rax
+; X64-NEXT: retq
 entry:
 %c = shl <2 x i64> %a,
 %d = extractelement <2 x i64> %c, i32 0
 ret i64 %d
 }
-; CHECK-LABEL: test1
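
Note on regeneration: the X32/X64/SSE/AVX2/AVX512 check blocks added by this patch are not hand-written; they are emitted by the llvm/utils/update_llc_test_checks.py script named in the NOTE lines. A typical invocation looks roughly like the lines below; the llc path is illustrative and the exact option spelling (e.g. --llc-binary vs. --llc) has varied between LLVM revisions, so treat this as a sketch rather than the exact command used for this commit:

  $ cd llvm
  $ utils/update_llc_test_checks.py --llc-binary ../build/bin/llc \
        test/CodeGen/X86/vec_shift.ll test/CodeGen/X86/vec_shift5.ll

The script runs the llc command from each RUN line and rewrites the ; <PREFIX>-LABEL / ; <PREFIX>-NEXT assertion comments in place, which is why the patch only updates RUN lines, adds autogenerated check blocks, and deletes the old hand-written CHECK comments.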