[X86][SSE] combineX86ShufflesRecursively - at Depth==0, only resolve KnownZero if it removes an input.
This stops infinite loops in which KnownUndef elements are converted to Zeroable, producing KnownZero elements that are then simplified (via SimplifyDemandedElts etc.) back to KnownUndef elements. Prep fix for PR43024, which will allow rL368307 to be re-applied.
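To make the "removes an input" condition concrete, here is a minimal standalone sketch of the heuristic, using plain STL containers in place of LLVM's APInt/SmallVector (the function and driver names are illustrative, not LLVM API). It reports whether folding the known-zero lanes into zero sentinels would leave one of the shuffle's concatenated inputs entirely unreferenced — the only case in which the Depth==0 path below now resolves zeros:

```cpp
#include <cstdio>
#include <vector>

constexpr int SM_SentinelUndef = -1; // mirrors the combiner's sentinels
constexpr int SM_SentinelZero = -2;

// Returns true if resolving the KnownZero lanes to SM_SentinelZero would
// leave at least one of the NumInputs concatenated operands unreferenced,
// i.e. the resolution makes real progress instead of just renaming lanes.
static bool zeroResolutionRemovesInput(const std::vector<int> &Mask,
                                       const std::vector<bool> &KnownUndef,
                                       const std::vector<bool> &KnownZero,
                                       unsigned NumInputs) {
  std::vector<bool> UsedInputs(NumInputs, false);
  unsigned NumUsed = 0;
  for (size_t i = 0, e = Mask.size(); i != e; ++i) {
    int M = Mask[i];
    // Lanes that are (or are about to become) undef/zero reference no input.
    if (KnownUndef[i] || KnownZero[i] || M == SM_SentinelUndef ||
        M == SM_SentinelZero)
      continue;
    size_t Input = size_t(M) / Mask.size(); // which concatenated operand
    if (!UsedInputs[Input]) {
      UsedInputs[Input] = true;
      if (++NumUsed == NumInputs)
        return false; // every input still referenced: resolving is a no-op
    }
  }
  return true; // some input went dead, so resolving zeros simplifies things
}

int main() {
  // Two 4-wide inputs concatenated: mask values 0-3 index input 0, 4-7
  // index input 1. Lanes 2 and 3 are known zero, so after resolution only
  // input 0 is referenced and input 1 can be dropped.
  std::vector<int> Mask = {0, 1, 6, 7};
  std::vector<bool> Undef(4, false), Zero = {false, false, true, true};
  std::printf("removes an input: %s\n",
              zeroResolutionRemovesInput(Mask, Undef, Zero, 2) ? "yes" : "no");
}
```

Note the polarity: the committed code computes the complement (ResolveKnownZeros stays true unless every input is still used), but the decision is the same.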
This commit is contained in:
parent 3fbd1c00b0
commit 3f087e38a2
@@ -6891,7 +6891,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
 // Replace target shuffle mask elements with known undef/zero sentinels.
 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                               const APInt &KnownUndef,
-                                              const APInt &KnownZero) {
+                                              const APInt &KnownZero,
+                                              bool ResolveKnownZeros = true) {
   unsigned NumElts = Mask.size();
   assert(KnownUndef.getBitWidth() == NumElts &&
          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6899,7 +6900,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
   for (unsigned i = 0; i != NumElts; ++i) {
     if (KnownUndef[i])
       Mask[i] = SM_SentinelUndef;
-    else if (KnownZero[i])
+    else if (ResolveKnownZeros && KnownZero[i])
       Mask[i] = SM_SentinelZero;
   }
 }
@@ -33071,17 +33072,36 @@ static SDValue combineX86ShufflesRecursively(
                                  OpZero, DAG, Depth, false))
     return SDValue();
 
-  resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
   SmallVector<int, 64> Mask;
   SmallVector<SDValue, 16> Ops;
 
   // We don't need to merge masks if the root is empty.
-  if (Depth == 0 && RootMask.size() == 1) {
+  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
+  if (EmptyRoot) {
+    // Only resolve zeros if it will remove an input, otherwise we might end
+    // up in an infinite loop.
+    bool ResolveKnownZeros = true;
+    if (!OpZero.isNullValue()) {
+      APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+      for (int i = 0, e = OpMask.size(); i != e; ++i) {
+        int M = OpMask[i];
+        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+          continue;
+        UsedInputs.setBit(M / OpMask.size());
+        if (UsedInputs.isAllOnesValue()) {
+          ResolveKnownZeros = false;
+          break;
+        }
+      }
+    }
+    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+                                      ResolveKnownZeros);
+
     Mask = OpMask;
     Ops.append(OpInputs.begin(), OpInputs.end());
   } else {
+    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
     // Add the inputs to the Ops list, avoiding duplicates.
     Ops.append(SrcOps.begin(), SrcOps.end());
 
@@ -33216,13 +33236,18 @@ static SDValue combineX86ShufflesRecursively(
   // the remaining recursion depth.
   if (Ops.size() < (MaxRecursionDepth - Depth)) {
     for (int i = 0, e = Ops.size(); i < e; ++i) {
+      // For empty roots, we need to resolve zeroable elements before combining
+      // them with other shuffles.
+      SmallVector<int, 64> ResolvedMask = Mask;
+      if (EmptyRoot)
+        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
       bool AllowVar = false;
       if (Ops[i].getNode()->hasOneUse() ||
           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
         AllowVar = AllowVariableMask;
       if (SDValue Res = combineX86ShufflesRecursively(
-              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
-              AllowVar, DAG, Subtarget))
+              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+              HasVariableMask, AllowVar, DAG, Subtarget))
         return Res;
     }
   }
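A subtlety in the hunk above: at an empty root, Mask keeps the possibly-unresolved lanes, and zeros are folded only into the per-iteration ResolvedMask copy handed to each recursive call. A toy illustration of that copy-then-resolve pattern, with hypothetical names and STL types standing in for SmallVector/APInt:

```cpp
#include <vector>

constexpr int SM_SentinelZero = -2;

// Fold known-zero lanes into zero sentinels on a copy, so the caller's
// mask -- the one that Depth==0 simplification keeps re-examining -- stays
// in its unresolved form and cannot ping-pong back to undef.
static std::vector<int> resolvedForRecursion(const std::vector<int> &Mask,
                                             const std::vector<bool> &KnownZero) {
  std::vector<int> Resolved = Mask;
  for (size_t i = 0; i != Resolved.size(); ++i)
    if (KnownZero[i])
      Resolved[i] = SM_SentinelZero;
  return Resolved; // recurse with this; leave the original Mask untouched
}
```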
@@ -2981,3 +2981,65 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa
   %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
   ret <8 x i16> %7
 }
+
+define void @PR43024() {
+; SSE2-LABEL: PR43024:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE2-NEXT:    movaps %xmm0, (%rax)
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    movss %xmm1, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR43024:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    movaps %xmm0, (%rax)
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    xorps %xmm0, %xmm0
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    movss %xmm1, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR43024:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE41-NEXT:    movaps %xmm0, (%rax)
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss %xmm1, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR43024:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; AVX-NEXT:    vmovaps %xmm0, (%rax)
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%rax)
+; AVX-NEXT:    retq
+  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
+  %1 = load <4 x float>, <4 x float>* undef, align 16
+  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %4 = fadd <4 x float> %2, %3
+  %5 = fadd <4 x float> zeroinitializer, %4
+  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+  %7 = fadd <4 x float> %6, %5
+  %8 = extractelement <4 x float> %7, i32 0
+  store float %8, float* undef, align 8
+  ret void
+}