[X86] Improve sum-of-reductions v4f32 test coverage

Ensure that the v4f32 reductions use a -0.0f start value, and add a fast-math (reassoc) test variant.
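
For reference, a minimal sketch (not part of this commit) of the two intrinsic call forms the updated tests exercise: a strict in-order reduction seeded with -0.0 (the identity value for fadd), and a reassoc-flagged variant that permits the backend to reassociate the adds. Function names here are illustrative only.

; Illustrative only: strict vs. reassoc forms of the fadd reduction intrinsic.
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define float @strict_reduce(<4 x float> %v) {
  ; -0.0 start value: sequential, in-order reduction.
  %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ret float %r
}

define float @fast_reduce(<4 x float> %v) {
  ; reassoc flag: the adds may be reassociated (e.g. into haddps/shuffle trees).
  %r = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ret float %r
}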
Simon Pilgrim 2021-01-14 11:05:04 +00:00
parent af8d27a7a8
commit 8f1d7f3753
1 changed file with 232 additions and 76 deletions

@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
; Vectorized Pairwise Sum Reductions
; e.g.
@@ -954,77 +954,137 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; }
define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT: addss %xmm4, %xmm5
; SSSE3-NEXT: movaps %xmm0, %xmm6
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
; SSSE3-NEXT: addss %xmm5, %xmm6
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-NEXT: addss %xmm6, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm5
; SSSE3-NEXT: movaps %xmm1, %xmm6
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSSE3-NEXT: addss %xmm5, %xmm6
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-NEXT: addss %xmm6, %xmm1
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm1
; SSSE3-NEXT: movaps %xmm2, %xmm5
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
; SSSE3-NEXT: addss %xmm1, %xmm5
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-NEXT: addss %xmm5, %xmm2
; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm1
; SSSE3-NEXT: movaps %xmm3, %xmm4
; SSSE3-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-NEXT: addss %xmm1, %xmm4
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-NEXT: addss %xmm4, %xmm3
; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: retq
;
; AVX-LABEL: reduction_sum_v4f32_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
; AVX-NEXT: vaddss %xmm6, %xmm4, %xmm4
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
; AVX-NEXT: vaddss %xmm6, %xmm4, %xmm4
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-NEXT: vaddss %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
%5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %0)
%6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %1)
%7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %2)
%8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %3)
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
%5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
%7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
%8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
%9 = insertelement <4 x float> undef, float %5, i32 0
%10 = insertelement <4 x float> %9, float %6, i32 1
%11 = insertelement <4 x float> %10, float %7, i32 2
@@ -1033,6 +1093,102 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT: addps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
; AVX-FAST-NEXT: retq
%5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
%7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
%8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
%9 = insertelement <4 x float> undef, float %5, i32 0
%10 = insertelement <4 x float> %9, float %6, i32 1
%11 = insertelement <4 x float> %10, float %7, i32 2
%12 = insertelement <4 x float> %11, float %8, i32 3
ret <4 x float> %12
}
define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0: