[X86] Improve sum-of-reductions v4f32 test coverage
Ensure that the v4f32 reductions use a -0.0f start value and add a fast-math (reassoc) test variant.
parent af8d27a7a8
commit 8f1d7f3753
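For context, a minimal standalone .ll sketch (not part of this commit; the function names @reduce_strict and @reduce_reassoc are illustrative only) of the two reduction forms the test now covers. -0.0 is the neutral element of IEEE fadd, so it is the natural start value for the strict, in-order reduction, while the reassoc flag lets the backend re-associate the sum into the pairwise/horizontal adds that the *-FAST prefixes check for:

define float @reduce_strict(<4 x float> %v) {
  ; strict (in-order) reduction: ((((-0.0 + v0) + v1) + v2) + v3)
  %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ret float %r
}

define float @reduce_reassoc(<4 x float> %v) {
  ; reassoc allows the sum to be re-associated, e.g. into haddps/vhaddps
  %r = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ret float %r
}

declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)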
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
 
 ; Vectorized Pairwise Sum Reductions
 ; e.g.
@@ -954,77 +954,137 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; }
 
 define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
-; SSSE3-LABEL: reduction_sum_v4f32_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    addss %xmm4, %xmm5
-; SSSE3-NEXT:    movaps %xmm0, %xmm6
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSSE3-NEXT:    addss %xmm5, %xmm6
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm6, %xmm0
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm5
-; SSSE3-NEXT:    movaps %xmm1, %xmm6
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
-; SSSE3-NEXT:    addss %xmm5, %xmm6
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm6, %xmm1
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm1
-; SSSE3-NEXT:    movaps %xmm2, %xmm5
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSSE3-NEXT:    addss %xmm1, %xmm5
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm5, %xmm2
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm1
-; SSSE3-NEXT:    movaps %xmm3, %xmm4
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-NEXT:    addss %xmm1, %xmm4
-; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm3
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: reduction_sum_v4f32_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX-NEXT:    vaddss %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX-NEXT:    vaddss %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-NEXT:    vaddss %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX-NEXT:    retq
-  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %0)
-  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %1)
-  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %2)
-  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %3)
+; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm0, %xmm4
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm1
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm2
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm3
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT:    retq
+;
+; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
+; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
+; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm0, %xmm4
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm4
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm4, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm2, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm3, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-FAST-NEXT:    retq
+  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
+  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
+  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
+  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
   %9 = insertelement <4 x float> undef, float %5, i32 0
   %10 = insertelement <4 x float> %9,   float %6, i32 1
   %11 = insertelement <4 x float> %10,  float %7, i32 2
@@ -1033,6 +1093,102 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 }
 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
 
+define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
+; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm1
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm3
+; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT:    retq
+;
+; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSSE3-FAST-NEXT:    addps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
+; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm2, %xmm2
+; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
+; AVX-FAST-NEXT:    retq
+  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
+  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
+  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
+  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
+  %9 = insertelement <4 x float> undef, float %5, i32 0
+  %10 = insertelement <4 x float> %9,   float %6, i32 1
+  %11 = insertelement <4 x float> %10,  float %7, i32 2
+  %12 = insertelement <4 x float> %11,  float %8, i32 3
+  ret <4 x float> %12
+}
+
 define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
 ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-SLOW:       # %bb.0: