[X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161)

Add shuffle lowering for the case where we can shuffle the lanes into place,
followed by an in-lane permute. This is mainly for cases where we can have
non-repeating permutes in each lane, but for now I've only enabled it for
v4f64 unary shuffles to fix PR39161 - there is no test coverage yet for other
shuffles that might benefit.

We now have several cross-lane shuffle lowering methods that all do something
similar. I've looked at merging some of these (notably by making the repeated
mask mechanism in lowerVectorShuffleByMerging128BitLanes optional), but there
are a lot of assertions/assumptions in the way that make this tricky, so I
ended up adding yet another relatively simple method instead.

Differential Revision: https://reviews.llvm.org/D53148

llvm-svn: 344446
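To make the two-step decomposition concrete, here is a minimal standalone sketch of how the new lowering splits the unary v4f64 mask <1,0,0,0> - the shuffle_v4f64_1000 case updated in the tests below - into a cross-lane mask and an in-lane mask. This is plain C++ with fixed-size arrays rather than LLVM's SDValue/SmallVector machinery; the decompose/TwoStepMasks names are illustrative only, not part of the patch.

#include <array>
#include <cstdio>
#include <optional>

// Hypothetical standalone model of the new lowering for v4f64:
// NumElts = 4, NumLanes = 2, NumEltsPerLane = 2. -1 means "undef".
struct TwoStepMasks {
  std::array<int, 4> LaneMask; // cross-lane step: lanes are only moved whole
  std::array<int, 4> PermMask; // in-lane step: elements permuted within lanes
};

static std::optional<TwoStepMasks> decompose(const std::array<int, 4> &Mask) {
  constexpr int NumElts = 4, NumLanes = 2, NumEltsPerLane = 2;
  std::array<int, NumLanes> SrcLaneMask;
  SrcLaneMask.fill(-1);
  TwoStepMasks R;
  R.LaneMask.fill(-1);
  R.PermMask.fill(-1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;

    // Each destination lane must be fed by a single source lane.
    int SrcLane = M / NumEltsPerLane;
    int DstLane = i / NumEltsPerLane;
    if (SrcLaneMask[DstLane] != -1 && SrcLaneMask[DstLane] != SrcLane)
      return std::nullopt; // the lowering does not apply
    SrcLaneMask[DstLane] = SrcLane;

    // LaneMask moves the whole source lane into place; PermMask then
    // rearranges elements within the (already moved) destination lane.
    R.LaneMask[i] = SrcLane * NumEltsPerLane + (i % NumEltsPerLane);
    R.PermMask[i] = DstLane * NumEltsPerLane + (M % NumEltsPerLane);
  }
  return R;
}

int main() {
  // shuffle_v4f64_1000 from the updated tests: unary mask <1,0,0,0>.
  auto Masks = decompose({1, 0, 0, 0});
  if (!Masks)
    return 1;
  std::printf("LaneMask:"); // expected: 0 1 0 1 (broadcast lane 0)
  for (int M : Masks->LaneMask)
    std::printf(" %d", M);
  std::printf("\nPermMask:"); // expected: 1 0 2 2 (matches vpermilpd [1,0,2,2])
  for (int M : Masks->PermMask)
    std::printf(" %d", M);
  std::printf("\n");
  return 0;
}

LaneMask {0,1,0,1} only ever reads lane 0 and broadcasts it to both halves (the new vinsertf128 $1, %xmm0, %ymm0, %ymm0 in the AVX1 test output), and PermMask {1,0,2,2} is exactly the vpermilpd ymm0[1,0,2,2] in-lane permute that follows it.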
@@ -13430,6 +13430,60 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
 }

+/// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// we should investigate merging them.
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumEltsPerLane = NumElts / NumLanes;
+
+  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
+  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Ensure that each lane comes from a single source lane.
+    int SrcLane = M / NumEltsPerLane;
+    int DstLane = i / NumEltsPerLane;
+    if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
+      return SDValue();
+    SrcLaneMask[DstLane] = SrcLane;
+
+    LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
+    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
+  }
+
+  // If we're only shuffling a single lowest lane and the rest are identity
+  // then don't bother.
+  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
+  int NumIdentityLanes = 0;
+  bool OnlyShuffleLowestLane = true;
+  for (int i = 0; i != NumLanes; ++i) {
+    if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
+                                   i * NumEltsPerLane))
+      NumIdentityLanes++;
+    else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
+      OnlyShuffleLowestLane = false;
+  }
+  if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+    return SDValue();
+
+  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
+  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+}
+
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
 /// a permutation and blend of those lanes.
 ///
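For intuition on why the two getVectorShuffle calls at the end reconstruct the original shuffle: element i of the second (in-lane) shuffle reads LanePermute[PermMask[i]], which in turn reads the original input at LaneMask[PermMask[i]]. Below is a minimal standalone check of that composition for the shuffle_v4f64_1000 mask, using the mask values from the sketch above (plain C++, illustrative only):

#include <array>
#include <cassert>

int main() {
  // Original unary mask, and the two masks the new lowering derives for it.
  std::array<int, 4> Mask = {1, 0, 0, 0};
  std::array<int, 4> LaneMask = {0, 1, 0, 1}; // cross-lane step
  std::array<int, 4> PermMask = {1, 0, 2, 2}; // in-lane step

  // Composing the two shuffles must read the same source element as Mask.
  for (int i = 0; i != 4; ++i)
    assert(LaneMask[PermMask[i]] == Mask[i]);
  return 0;
}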
@@ -14166,6 +14220,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;

+    // Try to permute the lanes and then use a per-lane permute.
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG, Subtarget);
@@ -14200,6 +14259,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Result;
+
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
@@ -91,9 +91,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_1000:
@@ -174,10 +173,8 @@ define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
 define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2233:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2233:
@@ -766,9 +763,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_1000: