forked from OSchip/llvm-project
				
			[X86][SSE] Add support for combining target shuffles to PALIGNR byte rotations
llvm-svn: 278787
This commit is contained in:
		
							parent
							
								
									5980232178
								
							
						
					
					
						commit
						f16cd361d4
					
				| 
						 | 
				
			
			@ -7747,13 +7747,8 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
 | 
			
		|||
/// elements, and takes the low elements as the result. Note that while this is
 | 
			
		||||
/// specified as a *right shift* because x86 is little-endian, it is a *left
 | 
			
		||||
/// rotate* of the vector lanes.
 | 
			
		||||
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		||||
                                              SDValue V1, SDValue V2,
 | 
			
		||||
                                              ArrayRef<int> Mask,
 | 
			
		||||
                                              const X86Subtarget &Subtarget,
 | 
			
		||||
                                              SelectionDAG &DAG) {
 | 
			
		||||
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
 | 
			
		||||
 | 
			
		||||
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
 | 
			
		||||
                                          ArrayRef<int> Mask) {
 | 
			
		||||
  int NumElts = Mask.size();
 | 
			
		||||
  int NumLanes = VT.getSizeInBits() / 128;
 | 
			
		||||
  int NumLaneElts = NumElts / NumLanes;
 | 
			
		||||
| 
						 | 
				
			
			@ -7769,20 +7764,28 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
  SDValue Lo, Hi;
 | 
			
		||||
  for (int l = 0; l < NumElts; l += NumLaneElts) {
 | 
			
		||||
    for (int i = 0; i < NumLaneElts; ++i) {
 | 
			
		||||
      if (Mask[l + i] < 0)
 | 
			
		||||
      int M = Mask[l + i];
 | 
			
		||||
 | 
			
		||||
      if (M == SM_SentinelUndef)
 | 
			
		||||
        continue;
 | 
			
		||||
 | 
			
		||||
      if (M == SM_SentinelZero)
 | 
			
		||||
        return -1;
 | 
			
		||||
 | 
			
		||||
      assert(0 <= M && M < (2*NumElts) && "Unexpected mask index.");
 | 
			
		||||
 | 
			
		||||
      // Get the mod-Size index and lane correct it.
 | 
			
		||||
      int LaneIdx = (Mask[l + i] % NumElts) - l;
 | 
			
		||||
      int LaneIdx = (M % NumElts) - l;
 | 
			
		||||
 | 
			
		||||
      // Make sure it was in this lane.
 | 
			
		||||
      if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
 | 
			
		||||
        return SDValue();
 | 
			
		||||
        return -1;
 | 
			
		||||
 | 
			
		||||
      // Determine where a rotated vector would have started.
 | 
			
		||||
      int StartIdx = i - LaneIdx;
 | 
			
		||||
      if (StartIdx == 0)
 | 
			
		||||
        // The identity rotation isn't interesting, stop.
 | 
			
		||||
        return SDValue();
 | 
			
		||||
        return -1;
 | 
			
		||||
 | 
			
		||||
      // If we found the tail of a vector the rotation must be the missing
 | 
			
		||||
      // front. If we found the head of a vector, it must be how much of the
 | 
			
		||||
| 
						 | 
				
			
			@ -7793,10 +7796,10 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
        Rotation = CandidateRotation;
 | 
			
		||||
      else if (Rotation != CandidateRotation)
 | 
			
		||||
        // The rotations don't match, so we can't match this mask.
 | 
			
		||||
        return SDValue();
 | 
			
		||||
        return -1;
 | 
			
		||||
 | 
			
		||||
      // Compute which value this mask is pointing at.
 | 
			
		||||
      SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
 | 
			
		||||
      SDValue MaskV = M < NumElts ? V1 : V2;
 | 
			
		||||
 | 
			
		||||
      // Compute which of the two target values this index should be assigned
 | 
			
		||||
      // to. This reflects whether the high elements are remaining or the low
 | 
			
		||||
| 
						 | 
				
			
			@ -7810,7 +7813,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
      else if (TargetV != MaskV)
 | 
			
		||||
        // This may be a rotation, but it pulls from the inputs in some
 | 
			
		||||
        // unsupported interleaving.
 | 
			
		||||
        return SDValue();
 | 
			
		||||
        return -1;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -7822,15 +7825,32 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
  else if (!Hi)
 | 
			
		||||
    Hi = Lo;
 | 
			
		||||
 | 
			
		||||
  // Cast the inputs to i8 vector of correct length to match PALIGNR or
 | 
			
		||||
  // PSLLDQ/PSRLDQ.
 | 
			
		||||
  MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
 | 
			
		||||
  Lo = DAG.getBitcast(ByteVT, Lo);
 | 
			
		||||
  Hi = DAG.getBitcast(ByteVT, Hi);
 | 
			
		||||
  V1 = Lo;
 | 
			
		||||
  V2 = Hi;
 | 
			
		||||
 | 
			
		||||
  // The actual rotate instruction rotates bytes, so we need to scale the
 | 
			
		||||
  // rotation based on how many bytes are in the vector lane.
 | 
			
		||||
  int Scale = 16 / NumLaneElts;
 | 
			
		||||
  return Rotation * Scale;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		||||
                                              SDValue V1, SDValue V2,
 | 
			
		||||
                                              ArrayRef<int> Mask,
 | 
			
		||||
                                              const X86Subtarget &Subtarget,
 | 
			
		||||
                                              SelectionDAG &DAG) {
 | 
			
		||||
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
 | 
			
		||||
 | 
			
		||||
  SDValue Lo = V1, Hi = V2;
 | 
			
		||||
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
 | 
			
		||||
  if (ByteRotation <= 0)
 | 
			
		||||
    return SDValue();
 | 
			
		||||
 | 
			
		||||
  // Cast the inputs to i8 vector of correct length to match PALIGNR or
 | 
			
		||||
  // PSLLDQ/PSRLDQ.
 | 
			
		||||
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 | 
			
		||||
  Lo = DAG.getBitcast(ByteVT, Lo);
 | 
			
		||||
  Hi = DAG.getBitcast(ByteVT, Hi);
 | 
			
		||||
 | 
			
		||||
  // SSSE3 targets can use the palignr instruction.
 | 
			
		||||
  if (Subtarget.hasSSSE3()) {
 | 
			
		||||
| 
						 | 
				
			
			@ -7838,7 +7858,7 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
           "512-bit PALIGNR requires BWI instructions");
 | 
			
		||||
    return DAG.getBitcast(
 | 
			
		||||
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
 | 
			
		||||
                        DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
 | 
			
		||||
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  assert(VT.is128BitVector() &&
 | 
			
		||||
| 
						 | 
				
			
			@ -7849,8 +7869,8 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
 | 
			
		|||
         "SSE2 rotate lowering only needed for v16i8!");
 | 
			
		||||
 | 
			
		||||
  // Default SSE2 implementation
 | 
			
		||||
  int LoByteShift = 16 - Rotation * Scale;
 | 
			
		||||
  int HiByteShift = Rotation * Scale;
 | 
			
		||||
  int LoByteShift = 16 - ByteRotation;
 | 
			
		||||
  int HiByteShift = ByteRotation;
 | 
			
		||||
 | 
			
		||||
  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
 | 
			
		||||
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
 | 
			
		||||
| 
						 | 
				
			
			@ -25198,6 +25218,19 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 | 
			
		|||
                                            unsigned &Shuffle, MVT &ShuffleVT,
 | 
			
		||||
                                            unsigned &PermuteImm) {
 | 
			
		||||
  unsigned NumMaskElts = Mask.size();
 | 
			
		||||
  bool FloatDomain = MaskVT.isFloatingPoint();
 | 
			
		||||
 | 
			
		||||
  // Attempt to match against PALIGNR byte rotate.
 | 
			
		||||
  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
 | 
			
		||||
                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
 | 
			
		||||
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
 | 
			
		||||
    if (0 < ByteRotation) {
 | 
			
		||||
      Shuffle = X86ISD::PALIGNR;
 | 
			
		||||
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
 | 
			
		||||
      PermuteImm = ByteRotation;
 | 
			
		||||
      return true;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Attempt to blend with zero.
 | 
			
		||||
  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2971,9 +2971,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
 | 
			
		|||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 | 
			
		||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
			
		||||
; AVX1-NEXT:    retq
 | 
			
		||||
;
 | 
			
		||||
| 
						 | 
				
			
			@ -3215,9 +3213,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
 | 
			
		|||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 | 
			
		||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
 | 
			
		||||
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
			
		||||
; AVX1-NEXT:    retq
 | 
			
		||||
;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -576,7 +576,7 @@ define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
 | 
			
		|||
; AVX1-LABEL: shuffle_v4i64_0112:
 | 
			
		||||
; AVX1:       # BB#0:
 | 
			
		||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 | 
			
		||||
; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 | 
			
		||||
; AVX1-NEXT:    retq
 | 
			
		||||
;
 | 
			
		||||
| 
						 | 
				
			
			@ -748,7 +748,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 | 
			
		|||
; AVX1-LABEL: shuffle_v4i64_0412:
 | 
			
		||||
; AVX1:       # BB#0:
 | 
			
		||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 | 
			
		||||
; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
			
		||||
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 | 
			
		||||
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 | 
			
		||||
| 
						 | 
				
			
			@ -775,7 +775,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 | 
			
		|||
; AVX1-LABEL: shuffle_v4i64_4012:
 | 
			
		||||
; AVX1:       # BB#0:
 | 
			
		||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 | 
			
		||||
; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 | 
			
		||||
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 | 
			
		||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 | 
			
		||||
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
			
		||||
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -184,6 +184,20 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
 | 
			
		|||
  ret <16 x i8> %2
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
 | 
			
		||||
; SSE-LABEL: combine_pshufb_as_palignr:
 | 
			
		||||
; SSE:       # BB#0:
 | 
			
		||||
; SSE-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
 | 
			
		||||
; SSE-NEXT:    retq
 | 
			
		||||
;
 | 
			
		||||
; AVX-LABEL: combine_pshufb_as_palignr:
 | 
			
		||||
; AVX:       # BB#0:
 | 
			
		||||
; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
 | 
			
		||||
; AVX-NEXT:    retq
 | 
			
		||||
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
 | 
			
		||||
  ret <16 x i8> %res0
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
 | 
			
		||||
; SSE-LABEL: combine_pshufb_as_pslldq:
 | 
			
		||||
; SSE:       # BB#0:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue