[X86] Improve sum-of-reductions v4f32 test coverage
Ensure that the v4f32 reductions use a -0.0f start value, and add a fast-math test variant.
parent af8d27a7a8
commit 8f1d7f3753
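For context, the two forms of the reduction intrinsic exercised by the updated test look roughly like this (a minimal sketch; %v, %sum and %fast are illustrative names, not taken from the test). -0.0 is the neutral value for fadd, so a reduction started at -0.0 is simply the sum of the four lanes, while the reassoc flag lets the backend reorder the lane additions, which is what the added _reassoc variant exercises:

  ; strict, in-order reduction started from -0.0
  %sum = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ; fast-math variant: reassoc permits reassociation of the lane adds
  %fast = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)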
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
; e.g.
@@ -954,77 +954,137 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; }

define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: addss %xmm4, %xmm5
; SSSE3-NEXT: movaps %xmm0, %xmm6
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
; SSSE3-NEXT: addss %xmm5, %xmm6
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-NEXT: addss %xmm6, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm5
; SSSE3-NEXT: movaps %xmm1, %xmm6
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSSE3-NEXT: addss %xmm5, %xmm6
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-NEXT: addss %xmm6, %xmm1
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm1
; SSSE3-NEXT: movaps %xmm2, %xmm5
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
; SSSE3-NEXT: addss %xmm1, %xmm5
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-NEXT: addss %xmm5, %xmm2
; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm1
; SSSE3-NEXT: movaps %xmm3, %xmm4
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-NEXT: addss %xmm1, %xmm4
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm3
; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: retq
;
; AVX-LABEL: reduction_sum_v4f32_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm6, %xmm4, %xmm4
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm6, %xmm4, %xmm4
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
%5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %0)
%6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %1)
%7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %2)
%8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %3)
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
%5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
%7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
%8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
%9 = insertelement <4 x float> undef, float %5, i32 0
%10 = insertelement <4 x float> %9, float %6, i32 1
%11 = insertelement <4 x float> %10, float %7, i32 2
@@ -1033,6 +1093,102 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT: addps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
; AVX-FAST-NEXT: retq
%5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
%7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
%8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
%9 = insertelement <4 x float> undef, float %5, i32 0
%10 = insertelement <4 x float> %9, float %6, i32 1
%11 = insertelement <4 x float> %10, float %7, i32 2
%12 = insertelement <4 x float> %11, float %8, i32 3
ret <4 x float> %12
}

define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0: