[X86][SSE] Added support for combining bit-shifts with shuffles.

Bit-shifts by a whole number of bytes can be represented as a shuffle mask suitable for combining.

Added a 'getFauxShuffleMask' function to allow us to create shuffle masks from other suitable operations.

llvm-svn: 288040
This commit is contained in:
Simon Pilgrim 2016-11-28 16:25:01 +00:00
parent 7fcacd8e0e
commit 3f10e66981
4 changed files with 99 additions and 64 deletions

View File

@ -5395,8 +5395,9 @@ static bool setTargetShuffleZeroElements(SDValue N,
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
Mask, IsUnary))
MVT VT = N.getSimpleValueType();
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
SDValue V1 = Ops[0];
@ -5458,9 +5459,61 @@ static bool setTargetShuffleZeroElements(SDValue N,
}
}
assert(VT.getVectorNumElements() == Mask.size() &&
"Different mask size from vector size!");
return true;
}
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops) {
Mask.clear();
Ops.clear();
MVT VT = N.getSimpleValueType();
unsigned NumElts = VT.getVectorNumElements();
unsigned Opcode = N.getOpcode();
switch (Opcode) {
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
// Out of range bit shifts are guaranteed to be zero.
if (VT.getScalarSizeInBits() <= ShiftVal) {
Mask.append(NumElts, SM_SentinelZero);
return true;
}
// We can only decode 'whole byte' bit shifts as shuffles.
if ((ShiftVal % 8) != 0)
break;
uint64_t ByteShift = ShiftVal / 8;
unsigned NumBytes = VT.getSizeInBits() / 8;
unsigned NumBytesPerElt = VT.getScalarSizeInBits() / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
Mask.append(NumBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
}
return false;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
@ -5470,7 +5523,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
SmallVectorImpl<int> &Mask) {
SmallVector<SDValue, 2> Ops;
if (!setTargetShuffleZeroElements(Op, Mask, Ops))
return false;
if (!getFauxShuffleMask(Op, Mask, Ops))
return false;
int NumElts = Mask.size();
bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
@ -26299,8 +26353,6 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
Ops.push_back(Input1);
}
assert(VT.getVectorNumElements() == OpMask.size() &&
"Different mask size from vector size!");
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&

View File

@ -442,20 +442,18 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: psrad $24, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT: psrld $16, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u],zero,xmm1[u,u,u],zero
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: psrad $24, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_4i64:
@ -532,34 +530,31 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,255,u,u,u,255>
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
; SSSE3-NEXT: psrld $16, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: psrad $24, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: psrad $24, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: psrad $24, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSSE3-NEXT: psrld $16, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: psrad $24, %xmm3
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i64:

View File

@ -599,14 +599,12 @@ define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
; X32: # BB#0:
; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64: # BB#0:
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
; X64-NEXT: retq
%1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = bitcast <16 x i16> %1 to <32 x i8>
@ -617,14 +615,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
; X32: # BB#0:
; X32-NEXT: vpslld $24, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64: # BB#0:
; X64-NEXT: vpslld $24, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
; X64-NEXT: retq
%1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
%2 = bitcast <8 x i32> %1 to <32 x i8>
@ -635,14 +631,12 @@ define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
; X32: # BB#0:
; X32-NEXT: vpsrlq $32, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,31,30,29,28,27,26,25,24,23]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlq_pshufb:
; X64: # BB#0:
; X64-NEXT: vpsrlq $32, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,31,30,29,28,27,26,25,24,23]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT: retq
%1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
%2 = bitcast <4 x i64> %1 to <32 x i8>

View File

@ -412,14 +412,12 @@ define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE: # BB#0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX: # BB#0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = bitcast <8 x i16> %1 to <16 x i8>
@ -430,14 +428,12 @@ define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE: # BB#0:
; SSE-NEXT: pslld $8, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX: # BB#0:
; AVX-NEXT: vpslld $8, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT: retq
%1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
%2 = bitcast <4 x i32> %1 to <16 x i8>
@ -448,14 +444,12 @@ define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE: # BB#0:
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX: # BB#0:
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT: retq
%1 = lshr <2 x i64> %a0, <i64 48, i64 48>
%2 = bitcast <2 x i64> %1 to <16 x i8>