[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary: On behalf of julia.koval@intel.com. The patch transforms the canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)), to the special psubus instruction on targets which support it (8-bit and 16-bit uints). umax(a,b) - b -> subus(a,b); a - umin(a,b) -> subus(a,b). There is also an extra case handled, when the right part of the sub is 32 bit and can be truncated using UMIN (this transformation was discussed in https://reviews.llvm.org/D25987). The example of special case code: ``` void foo(unsigned short *p, int max, int n) { int i; unsigned m; for (i = 0; i < n; i++) { m = *--p; *p = (unsigned short)(m >= max ? m-max : 0); } } ``` Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is a valid transformation, because if max > max_short, the result of the expression will be zero. Here is the table of types I try to support; special case items are bold: | Size | 128 | 256 | 512 | ----- | ----- | ----- | ----- | i8 | v16i8 | v32i8 | v64i8 | i16 | v8i16 | v16i16 | v32i16 | i32 | | **v8i32** | **v16i32** | i64 | | | **v8i64** Reviewers: zvi, spatel, DavidKreitzer, RKSimon Reviewed By: zvi Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D37534 llvm-svn: 315237
This commit is contained in:
		
							parent
							
								
									663ba15ed6
								
							
						
					
					
						commit
						c1d5955684
					
				| 
						 | 
					@ -35901,6 +35901,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
 | 
				
			||||||
  return combineAddOrSubToADCOrSBB(N, DAG);
 | 
					  return combineAddOrSubToADCOrSBB(N, DAG);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// Try to rewrite a vector SUB whose operand is an unsigned max/min as an
					/// X86ISD::SUBUS (unsigned saturating subtract) node.
					///
					/// Recognized shapes:
					///   umax(a, b) - b  -->  subus(a, b)
					///   a - umin(a, b)  -->  subus(a, b)
					///
					/// For v8i32, v16i32 and v8i64 there is no matching SUBUS type, so the
					/// operation is performed on a narrower element type instead; this is only
					/// done when the known-zero leading bits of the SUBUS left operand show that
					/// it fits in the narrower element.
					///
					/// \param N          the SUB node being combined.
					/// \param DAG        the DAG used to query known bits and build new nodes.
					/// \param Subtarget  queried for SSE2 / SSE4.1 / AVX2 / AVX512 / BWI support.
					/// \returns the replacement value, or an empty SDValue if nothing matched.
					static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
					                                 const X86Subtarget &Subtarget) {
					  const SDValue Minuend = N->getOperand(0);
					  const SDValue Subtrahend = N->getOperand(1);
					  const EVT VT = N->getValueType(0);

					  // PSUBUS itself is available from SSE2 on, but the preprocessing used for
					  // v8i32 needs umin, which appears in SSE41.
					  const bool IsCandidateType =
					      (Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
					      (Subtarget.hasSSE41() && VT == MVT::v8i32) ||
					      (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) ||
					      (Subtarget.hasAVX512() && Subtarget.hasBWI() &&
					       (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
					        VT == MVT::v8i64));
					  if (!IsCandidateType)
					    return SDValue();

					  // Match umax(a,b) - b or a - umin(a,b); either becomes subus(a,b).
					  // TODO: Need to add IR canonicalization for this code.
					  SDValue SatLHS, SatRHS;
					  if (Minuend.getOpcode() == ISD::UMAX) {
					    // The subtrahend has to be one of the max operands; the other max
					    // operand becomes the left operand of the SUBUS.
					    const SDValue MaxA = Minuend.getOperand(0);
					    const SDValue MaxB = Minuend.getOperand(1);
					    const bool FirstIsSubtrahend = DAG.isEqualTo(MaxA, Subtrahend);
					    if (!FirstIsSubtrahend && !DAG.isEqualTo(MaxB, Subtrahend))
					      return SDValue();
					    SatLHS = FirstIsSubtrahend ? MaxB : MaxA;
					    SatRHS = Subtrahend;
					  } else if (Subtrahend.getOpcode() == ISD::UMIN) {
					    // The minuend has to be one of the min operands; the other min
					    // operand becomes the right operand of the SUBUS.
					    const SDValue MinA = Subtrahend.getOperand(0);
					    const SDValue MinB = Subtrahend.getOperand(1);
					    const bool FirstIsMinuend = DAG.isEqualTo(MinA, Minuend);
					    if (!FirstIsMinuend && !DAG.isEqualTo(MinB, Minuend))
					      return SDValue();
					    SatLHS = Minuend;
					    SatRHS = FirstIsMinuend ? MinB : MinA;
					  } else {
					    return SDValue();
					  }

					  const SDLoc DL(N);

					  // PSUBUS doesn't support v8i32/v8i64/v16i32; every other type that got
					  // this far maps straight onto a SUBUS node.
					  const bool NeedsNarrowing =
					      VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64;
					  if (!NeedsNarrowing)
					    return DAG.getNode(X86ISD::SUBUS, DL, VT, SatLHS, SatRHS);

					  // The narrowing path is only applied when the left operand is known to
					  // have enough leading zero bits: at least 16 for 32-bit elements and at
					  // least 48 for 64-bit elements.
					  KnownBits LHSBits;
					  DAG.computeKnownBits(SatLHS, LHSBits);
					  const unsigned LeadingZeros = LHSBits.countMinLeadingZeros();
					  const unsigned RequiredZeros = VT == MVT::v8i64 ? 48 : 16;
					  if (LeadingZeros < RequiredZeros)
					    return SDValue();

					  // Pick the narrow vector type. Only v16i32 has a choice: 8-bit elements
					  // when at least 24 leading bits are known zero, 16-bit elements otherwise.
					  const EVT WideVT = SatLHS.getValueType();
					  EVT NarrowVT;
					  if (VT == MVT::v16i32)
					    NarrowVT = LeadingZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
					  else
					    NarrowVT = MVT::v8i16;

					  // Clamp the right operand to the largest value representable in the
					  // narrow element (umin(0xFFF.., RHS)) before it is truncated.
					  const SDLoc LHSLoc(SatLHS);
					  const APInt NarrowMax = APInt::getLowBitsSet(
					      WideVT.getScalarSizeInBits(), NarrowVT.getScalarSizeInBits());
					  SDValue ClampMask = DAG.getConstant(NarrowMax, LHSLoc, WideVT);
					  SDValue ClampedRHS =
					      DAG.getNode(ISD::UMIN, LHSLoc, WideVT, SatRHS, ClampMask);

					  SDValue NarrowLHS = DAG.getZExtOrTrunc(SatLHS, LHSLoc, NarrowVT);
					  SDValue NarrowRHS =
					      DAG.getZExtOrTrunc(ClampedRHS, SDLoc(SatRHS), NarrowVT);
					  SDValue NarrowSubus =
					      DAG.getNode(X86ISD::SUBUS, DL, NarrowVT, NarrowLHS, NarrowRHS);

					  // Extend the result back to the original width since users may consume
					  // it at that width; if they truncate again, the zext/trunc pair can be
					  // shrunk.
					  return DAG.getZExtOrTrunc(NarrowSubus, DL, WideVT);
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
 | 
					static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
 | 
				
			||||||
                          const X86Subtarget &Subtarget) {
 | 
					                          const X86Subtarget &Subtarget) {
 | 
				
			||||||
  SDValue Op0 = N->getOperand(0);
 | 
					  SDValue Op0 = N->getOperand(0);
 | 
				
			||||||
| 
						 | 
					@ -35934,6 +36017,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
 | 
				
			||||||
  if (SDValue V = combineIncDecVector(N, DAG))
 | 
					  if (SDValue V = combineIncDecVector(N, DAG))
 | 
				
			||||||
    return V;
 | 
					    return V;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Try to create PSUBUS if SUB's argument is max/min
 | 
				
			||||||
 | 
					  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
 | 
				
			||||||
 | 
					    return V;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return combineAddOrSubToADCOrSBB(N, DAG);
 | 
					  return combineAddOrSubToADCOrSBB(N, DAG);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1143,20 +1143,17 @@ define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_8i16_max:
 | 
					; SSE41-LABEL: psubus_8i16_max:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm1, %xmm0
 | 
					; SSE41-NEXT:    psubusw %xmm1, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX-LABEL: psubus_8i16_max:
 | 
					; AVX-LABEL: psubus_8i16_max:
 | 
				
			||||||
; AVX:       # BB#0: # %vector.ph
 | 
					; AVX:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
 | 
					; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX-NEXT:    retq
 | 
					; AVX-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_8i16_max:
 | 
					; AVX512-LABEL: psubus_8i16_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
 | 
					; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <8 x i16> %x, %y
 | 
					  %cmp = icmp ult <8 x i16> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1168,20 +1165,17 @@ vector.ph:
 | 
				
			||||||
define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
 | 
					define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
 | 
				
			||||||
; SSE-LABEL: psubus_16i8_max:
 | 
					; SSE-LABEL: psubus_16i8_max:
 | 
				
			||||||
; SSE:       # BB#0: # %vector.ph
 | 
					; SSE:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm1, %xmm0
 | 
					; SSE-NEXT:    psubusb %xmm1, %xmm0
 | 
				
			||||||
; SSE-NEXT:    psubb %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE-NEXT:    retq
 | 
					; SSE-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX-LABEL: psubus_16i8_max:
 | 
					; AVX-LABEL: psubus_16i8_max:
 | 
				
			||||||
; AVX:       # BB#0: # %vector.ph
 | 
					; AVX:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
 | 
					; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX-NEXT:    retq
 | 
					; AVX-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_16i8_max:
 | 
					; AVX512-LABEL: psubus_16i8_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
 | 
					; AVX512-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <16 x i8> %x, %y
 | 
					  %cmp = icmp ult <16 x i8> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1245,33 +1239,27 @@ define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_16i16_max:
 | 
					; SSE41-LABEL: psubus_16i16_max:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm3, %xmm1
 | 
					; SSE41-NEXT:    psubusw %xmm2, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm2, %xmm0
 | 
					; SSE41-NEXT:    psubusw %xmm3, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm2, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm3, %xmm1
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_16i16_max:
 | 
					; AVX1-LABEL: psubus_16i16_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm2
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_16i16_max:
 | 
					; AVX2-LABEL: psubus_16i16_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_16i16_max:
 | 
					; AVX512-LABEL: psubus_16i16_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 | 
					; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <16 x i16> %x, %y
 | 
					  %cmp = icmp ult <16 x i16> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1379,46 +1367,35 @@ define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_32i16_max:
 | 
					; SSE41-LABEL: psubus_32i16_max:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm7, %xmm3
 | 
					; SSE41-NEXT:    psubusw %xmm4, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm6, %xmm2
 | 
					; SSE41-NEXT:    psubusw %xmm5, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm5, %xmm1
 | 
					; SSE41-NEXT:    psubusw %xmm6, %xmm2
 | 
				
			||||||
; SSE41-NEXT:    pmaxuw %xmm4, %xmm0
 | 
					; SSE41-NEXT:    psubusw %xmm7, %xmm3
 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm4, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm5, %xmm1
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm6, %xmm2
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubw %xmm7, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_32i16_max:
 | 
					; AVX1-LABEL: psubus_32i16_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm3, %xmm1, %xmm4
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 | 
					; AVX1-NEXT:    vpsubusw %xmm4, %xmm5, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm5, %xmm1, %xmm1
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm6
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 | 
					; AVX1-NEXT:    vpsubusw %xmm5, %xmm6, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vpmaxuw %xmm7, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm7, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusw %xmm3, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm2, %xmm6, %xmm2
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm5, %xmm1, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubw %xmm3, %xmm4, %xmm2
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_32i16_max:
 | 
					; AVX2-LABEL: psubus_32i16_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmaxuw %ymm3, %ymm1, %ymm1
 | 
					; AVX2-NEXT:    vpsubusw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpmaxuw %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpsubusw %ymm3, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
 | 
					 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_32i16_max:
 | 
					; AVX512-LABEL: psubus_32i16_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
 | 
					; AVX512-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <32 x i16> %x, %y
 | 
					  %cmp = icmp ult <32 x i16> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1430,46 +1407,35 @@ vector.ph:
 | 
				
			||||||
define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
 | 
					define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
 | 
				
			||||||
; SSE-LABEL: psubus_64i8_max:
 | 
					; SSE-LABEL: psubus_64i8_max:
 | 
				
			||||||
; SSE:       # BB#0: # %vector.ph
 | 
					; SSE:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm7, %xmm3
 | 
					; SSE-NEXT:    psubusb %xmm4, %xmm0
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm6, %xmm2
 | 
					; SSE-NEXT:    psubusb %xmm5, %xmm1
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm5, %xmm1
 | 
					; SSE-NEXT:    psubusb %xmm6, %xmm2
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm4, %xmm0
 | 
					; SSE-NEXT:    psubusb %xmm7, %xmm3
 | 
				
			||||||
; SSE-NEXT:    psubb %xmm4, %xmm0
 | 
					 | 
				
			||||||
; SSE-NEXT:    psubb %xmm5, %xmm1
 | 
					 | 
				
			||||||
; SSE-NEXT:    psubb %xmm6, %xmm2
 | 
					 | 
				
			||||||
; SSE-NEXT:    psubb %xmm7, %xmm3
 | 
					 | 
				
			||||||
; SSE-NEXT:    retq
 | 
					; SSE-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_64i8_max:
 | 
					; AVX1-LABEL: psubus_64i8_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm3, %xmm1, %xmm4
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 | 
					; AVX1-NEXT:    vpsubusb %xmm4, %xmm5, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm5, %xmm1, %xmm1
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm6
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 | 
					; AVX1-NEXT:    vpsubusb %xmm5, %xmm6, %xmm5
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusb %xmm2, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm7, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusb %xmm3, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm2, %xmm6, %xmm2
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm5, %xmm1, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm3, %xmm4, %xmm2
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_64i8_max:
 | 
					; AVX2-LABEL: psubus_64i8_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmaxub %ymm3, %ymm1, %ymm1
 | 
					; AVX2-NEXT:    vpsubusb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpmaxub %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpsubusb %ymm3, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
 | 
					 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_64i8_max:
 | 
					; AVX512-LABEL: psubus_64i8_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
 | 
					; AVX512-NEXT:    vpsubusb %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <64 x i8> %x, %y
 | 
					  %cmp = icmp ult <64 x i8> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1481,33 +1447,27 @@ vector.ph:
 | 
				
			||||||
define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
 | 
					define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
 | 
				
			||||||
; SSE-LABEL: psubus_32i8_max:
 | 
					; SSE-LABEL: psubus_32i8_max:
 | 
				
			||||||
; SSE:       # BB#0: # %vector.ph
 | 
					; SSE:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm3, %xmm1
 | 
					; SSE-NEXT:    psubusb %xmm2, %xmm0
 | 
				
			||||||
; SSE-NEXT:    pmaxub %xmm2, %xmm0
 | 
					; SSE-NEXT:    psubusb %xmm3, %xmm1
 | 
				
			||||||
; SSE-NEXT:    psubb %xmm2, %xmm0
 | 
					 | 
				
			||||||
; SSE-NEXT:    psubb %xmm3, %xmm1
 | 
					 | 
				
			||||||
; SSE-NEXT:    retq
 | 
					; SSE-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_32i8_max:
 | 
					; AVX1-LABEL: psubus_32i8_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm2
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusb %xmm2, %xmm3, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpmaxub %xmm3, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_32i8_max:
 | 
					; AVX2-LABEL: psubus_32i8_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_32i8_max:
 | 
					; AVX512-LABEL: psubus_32i8_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 | 
					; AVX512-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %cmp = icmp ult <32 x i8> %x, %y
 | 
					  %cmp = icmp ult <32 x i8> %x, %y
 | 
				
			||||||
| 
						 | 
					@ -1586,53 +1546,41 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_8i32_max:
 | 
					; SSE41-LABEL: psubus_8i32_max:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 | 
					; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm2
 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    pmaxud %xmm1, %xmm0
 | 
					; SSE41-NEXT:    packusdw %xmm2, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    pmaxud %xmm2, %xmm3
 | 
					; SSE41-NEXT:    psubusw %xmm1, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm2, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_8i32_max:
 | 
					; AVX1-LABEL: psubus_8i32_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm3, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 | 
					; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 | 
				
			||||||
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 | 
					; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vzeroupper
 | 
					; AVX1-NEXT:    vzeroupper
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_8i32_max:
 | 
					; AVX2-LABEL: psubus_8i32_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
 | 
				
			||||||
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
				
			||||||
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 | 
					; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vzeroupper
 | 
					; AVX2-NEXT:    vzeroupper
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_8i32_max:
 | 
					; AVX512-LABEL: psubus_8i32_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
 | 
				
			||||||
; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 | 
					; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vzeroupper
 | 
					; AVX512-NEXT:    vzeroupper
 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
| 
						 | 
					@ -1986,10 +1934,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_8i64_max:
 | 
					; AVX512-LABEL: psubus_8i64_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 | 
					; AVX512-NEXT:    vpmovusqw %zmm1, %xmm1
 | 
				
			||||||
; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 | 
					; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vzeroupper
 | 
					; AVX512-NEXT:    vzeroupper
 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
| 
						 | 
					@ -2155,56 +2101,59 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_16i32_max:
 | 
					; AVX1-LABEL: psubus_16i32_max:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 | 
					; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
 | 
					; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
 | 
					; AVX1-NEXT:    vpminud %xmm5, %xmm2, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm6, %xmm5, %xmm5
 | 
					; AVX1-NEXT:    vpminud %xmm5, %xmm4, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm1, %xmm4, %xmm4
 | 
					; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 | 
					; AVX1-NEXT:    vpminud %xmm5, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm7, %xmm3, %xmm3
 | 
					; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
 | 
					; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm1, %xmm4, %xmm1
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm4
 | 
					; AVX1-NEXT:    vpsubusw %xmm2, %xmm3, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 | 
					; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
 | 
					; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
 | 
					; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
 | 
					; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 | 
				
			||||||
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 | 
					; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_16i32_max:
 | 
					; AVX2-LABEL: psubus_16i32_max:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
 | 
					; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
				
			||||||
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 | 
					; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535]
 | 
				
			||||||
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX2-NEXT:    vpminud %ymm4, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpmaxud %ymm2, %ymm3, %ymm3
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm2
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 | 
				
			||||||
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpminud %ymm4, %ymm2, %ymm2
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpackusdw %ymm0, %ymm3, %ymm3
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpsubusw %xmm2, %xmm0, %xmm0
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vpsubusw %xmm1, %xmm3, %xmm1
 | 
				
			||||||
 | 
					; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_16i32_max:
 | 
					; AVX512-LABEL: psubus_16i32_max:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 | 
					; AVX512-NEXT:    vpmovusdw %zmm1, %ymm1
 | 
				
			||||||
; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
 | 
					; AVX512-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
  %lhs = zext <16 x i16> %x to <16 x i32>
 | 
					  %lhs = zext <16 x i16> %x to <16 x i32>
 | 
				
			||||||
| 
						 | 
					@ -2281,53 +2230,41 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_i16_i32_max_swapped:
 | 
					; SSE41-LABEL: psubus_i16_i32_max_swapped:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 | 
					; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm2
 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    pmaxud %xmm1, %xmm0
 | 
					; SSE41-NEXT:    packusdw %xmm2, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    pmaxud %xmm2, %xmm3
 | 
					; SSE41-NEXT:    psubusw %xmm1, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm2, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_i16_i32_max_swapped:
 | 
					; AVX1-LABEL: psubus_i16_i32_max_swapped:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm3, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
 | 
					; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 | 
				
			||||||
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 | 
					; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vzeroupper
 | 
					; AVX1-NEXT:    vzeroupper
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_i16_i32_max_swapped:
 | 
					; AVX2-LABEL: psubus_i16_i32_max_swapped:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
 | 
				
			||||||
; AVX2-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
 | 
					; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
				
			||||||
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 | 
					; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vzeroupper
 | 
					; AVX2-NEXT:    vzeroupper
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_i16_i32_max_swapped:
 | 
					; AVX512-LABEL: psubus_i16_i32_max_swapped:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
 | 
				
			||||||
; AVX512-NEXT:    vpmaxud %ymm0, %ymm1, %ymm0
 | 
					; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vzeroupper
 | 
					; AVX512-NEXT:    vzeroupper
 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
| 
						 | 
					@ -2407,53 +2344,41 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SSE41-LABEL: psubus_i16_i32_min:
 | 
					; SSE41-LABEL: psubus_i16_i32_min:
 | 
				
			||||||
; SSE41:       # BB#0: # %vector.ph
 | 
					; SSE41:       # BB#0: # %vector.ph
 | 
				
			||||||
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 | 
					; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pminud %xmm0, %xmm1
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pminud %xmm3, %xmm2
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm2
 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm2, %xmm3
 | 
					; SSE41-NEXT:    pminud %xmm3, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    psubd %xmm1, %xmm0
 | 
					; SSE41-NEXT:    packusdw %xmm2, %xmm1
 | 
				
			||||||
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					; SSE41-NEXT:    psubusw %xmm1, %xmm0
 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm0
 | 
					 | 
				
			||||||
; SSE41-NEXT:    pshufb %xmm1, %xmm3
 | 
					 | 
				
			||||||
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 | 
					 | 
				
			||||||
; SSE41-NEXT:    retq
 | 
					; SSE41-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX1-LABEL: psubus_i16_i32_min:
 | 
					; AVX1-LABEL: psubus_i16_i32_min:
 | 
				
			||||||
; AVX1:       # BB#0: # %vector.ph
 | 
					; AVX1:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpminud %xmm1, %xmm2, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 | 
					 | 
				
			||||||
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 | 
					; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm3, %xmm3
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpminud %xmm4, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 | 
					; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 | 
				
			||||||
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 | 
					; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX1-NEXT:    vzeroupper
 | 
					; AVX1-NEXT:    vzeroupper
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: psubus_i16_i32_min:
 | 
					; AVX2-LABEL: psubus_i16_i32_min:
 | 
				
			||||||
; AVX2:       # BB#0: # %vector.ph
 | 
					; AVX2:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
 | 
				
			||||||
; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm1
 | 
					; AVX2-NEXT:    vpminud %ymm2, %ymm1, %ymm1
 | 
				
			||||||
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
				
			||||||
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 | 
					; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 | 
					; AVX2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vzeroupper
 | 
					; AVX2-NEXT:    vzeroupper
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: psubus_i16_i32_min:
 | 
					; AVX512-LABEL: psubus_i16_i32_min:
 | 
				
			||||||
; AVX512:       # BB#0: # %vector.ph
 | 
					; AVX512:       # BB#0: # %vector.ph
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 | 
					; AVX512-NEXT:    vpmovusdw %ymm1, %xmm1
 | 
				
			||||||
; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm1
 | 
					; AVX512-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0
 | 
				
			||||||
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    vzeroupper
 | 
					; AVX512-NEXT:    vzeroupper
 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
vector.ph:
 | 
					vector.ph:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue