[X86][SSE] Simplify demanded elements from BROADCAST shuffle source.
If broadcasting from another shuffle, attempt to simplify it. We can probably generalize this a lot more (embedding it in combineX86ShufflesRecursively), but BROADCAST is one of the more troublesome cases, as it accepts inputs whose size differs from that of the result. llvm-svn: 323602
This commit is contained in:
parent
73e88d394b
commit
fe3fac805a
|
|
@ -28242,6 +28242,14 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
|
|||
unsigned NumMaskElts = Mask.size();
|
||||
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
|
||||
|
||||
// Match against a VZEXT_MOVL vXi32 zero-extending instruction.
|
||||
if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
|
||||
isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
|
||||
Shuffle = X86ISD::VZEXT_MOVL;
|
||||
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
|
||||
// TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
|
||||
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
|
||||
|
|
@ -29790,6 +29798,28 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
|
|||
}
|
||||
|
||||
switch (Opcode) {
|
||||
case X86ISD::VBROADCAST: {
|
||||
// If broadcasting from another shuffle, attempt to simplify it.
|
||||
// TODO - we really need a general SimplifyDemandedVectorElts mechanism.
|
||||
SDValue Src = N.getOperand(0);
|
||||
SDValue BC = peekThroughBitcasts(Src);
|
||||
EVT SrcVT = Src.getValueType();
|
||||
EVT BCVT = BC.getValueType();
|
||||
if (isTargetShuffle(BC.getOpcode()) &&
|
||||
VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
|
||||
unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
|
||||
SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
|
||||
SM_SentinelUndef);
|
||||
for (unsigned i = 0; i != Scale; ++i)
|
||||
DemandedMask[i] = i;
|
||||
if (SDValue Res = combineX86ShufflesRecursively(
|
||||
{BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
|
||||
/*HasVarMask*/ false, DAG, DCI, Subtarget))
|
||||
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
|
||||
DAG.getBitcast(SrcVT, Res));
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
case X86ISD::PSHUFD:
|
||||
case X86ISD::PSHUFLW:
|
||||
case X86ISD::PSHUFHW:
|
||||
|
|
|
|||
|
|
@ -459,9 +459,7 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i3
|
|||
define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
|
||||
; CHECK-LABEL: test_2xi32_to_8xi32_mem:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
|
||||
; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -470,11 +468,9 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
|
|||
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -486,11 +482,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32>
|
|||
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -501,11 +495,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i3
|
|||
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -517,11 +509,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32>
|
|||
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -532,11 +522,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i3
|
|||
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -548,11 +536,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32>
|
|||
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -563,11 +549,9 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i3
|
|||
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -579,11 +563,9 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32>
|
|||
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
|
||||
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
|
||||
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
|
||||
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
|
||||
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
|
||||
; CHECK-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
|
|||
|
|
@ -203,7 +203,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
|
|||
;
|
||||
; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
|
||||
; AVX2OR512VL: # %bb.0:
|
||||
; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
|
||||
; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
|
||||
; AVX2OR512VL-NEXT: retq
|
||||
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
|
||||
|
|
|
|||
|
|
@ -345,7 +345,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
|
|||
;
|
||||
; X64AVX2-LABEL: buildvector_v4f32_0404:
|
||||
; X64AVX2: # %bb.0:
|
||||
; X64AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
|
||||
; X64AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; X64AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||
; X64AVX2-NEXT: vmovapd %xmm0, (%rdi)
|
||||
; X64AVX2-NEXT: retq
|
||||
|
|
|
|||
|
|
@ -606,16 +606,12 @@ define <8 x i32> @load_splat_8i32_2i32_0101(<2 x i32>* %vp) {
|
|||
;
|
||||
; AVX2-LABEL: load_splat_8i32_2i32_0101:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
|
||||
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: load_splat_8i32_2i32_0101:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
|
||||
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
|
||||
; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; AVX512-NEXT: retq
|
||||
%vec = load <2 x i32>, <2 x i32>* %vp
|
||||
%res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
|
|
@ -642,10 +638,8 @@ define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
|
|||
;
|
||||
; AVX2-LABEL: load_splat_16i32_2i32_0101:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
|
||||
; AVX2-NEXT: vmovdqa %ymm0, %ymm1
|
||||
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
|
||||
; AVX2-NEXT: vmovaps %ymm0, %ymm1
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: load_splat_16i32_2i32_0101:
|
||||
|
|
|
|||
Loading…
Reference in New Issue