forked from OSchip/llvm-project
				
			[X86][AVX] Enable INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)) shuffle combining
Push the insert_subvector up through the shuffle operands to help find more cross-lane shuffles. This exposes a couple of minor issues that will be fixed shortly: missed broadcast folds — we have a mixture of vzext_load lengths that need cleaning up; combine-sdiv.ll — AVX1 SimplifyDemandedVectorElts failure (hits max depth due to a couple of extra bitcasts). llvm-svn: 352963
This commit is contained in:
		
							parent
							
								
									bd42f97946
								
							
						
					
					
						commit
						dbf302c9f1
					
				| 
						 | 
					@ -6500,14 +6500,14 @@ static bool setTargetShuffleZeroElements(SDValue N,
 | 
				
			||||||
static bool resolveTargetShuffleInputs(SDValue Op,
 | 
					static bool resolveTargetShuffleInputs(SDValue Op,
 | 
				
			||||||
                                       SmallVectorImpl<SDValue> &Inputs,
 | 
					                                       SmallVectorImpl<SDValue> &Inputs,
 | 
				
			||||||
                                       SmallVectorImpl<int> &Mask,
 | 
					                                       SmallVectorImpl<int> &Mask,
 | 
				
			||||||
                                       const SelectionDAG &DAG);
 | 
					                                       SelectionDAG &DAG);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Attempt to decode ops that could be represented as a shuffle mask.
 | 
					// Attempt to decode ops that could be represented as a shuffle mask.
 | 
				
			||||||
// The decoded shuffle mask may contain a different number of elements to the
 | 
					// The decoded shuffle mask may contain a different number of elements to the
 | 
				
			||||||
// destination value type.
 | 
					// destination value type.
 | 
				
			||||||
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
 | 
					static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
 | 
				
			||||||
                               SmallVectorImpl<SDValue> &Ops,
 | 
					                               SmallVectorImpl<SDValue> &Ops,
 | 
				
			||||||
                               const SelectionDAG &DAG) {
 | 
					                               SelectionDAG &DAG) {
 | 
				
			||||||
  Mask.clear();
 | 
					  Mask.clear();
 | 
				
			||||||
  Ops.clear();
 | 
					  Ops.clear();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6590,8 +6590,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
 | 
				
			||||||
    return true;
 | 
					    return true;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  case ISD::INSERT_SUBVECTOR: {
 | 
					  case ISD::INSERT_SUBVECTOR: {
 | 
				
			||||||
    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where
 | 
					    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
 | 
				
			||||||
    // SRC0/SRC1 are both of the same valuetype VT.
 | 
					 | 
				
			||||||
    SDValue Src = N.getOperand(0);
 | 
					    SDValue Src = N.getOperand(0);
 | 
				
			||||||
    SDValue Sub = N.getOperand(1);
 | 
					    SDValue Sub = N.getOperand(1);
 | 
				
			||||||
    EVT SubVT = Sub.getValueType();
 | 
					    EVT SubVT = Sub.getValueType();
 | 
				
			||||||
| 
						 | 
					@ -6604,25 +6603,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
 | 
				
			||||||
    if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
 | 
					    if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
 | 
				
			||||||
                                    SubMask, DAG))
 | 
					                                    SubMask, DAG))
 | 
				
			||||||
      return false;
 | 
					      return false;
 | 
				
			||||||
    if (SubMask.size() != NumSubElts)
 | 
					    int InsertIdx = N.getConstantOperandVal(2);
 | 
				
			||||||
      return false;
 | 
					    if (SubMask.size() != NumSubElts) {
 | 
				
			||||||
 | 
					      assert(((SubMask.size() % NumSubElts) == 0 ||
 | 
				
			||||||
 | 
					              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
 | 
				
			||||||
 | 
					      if ((NumSubElts % SubMask.size()) == 0) {
 | 
				
			||||||
 | 
					        int Scale = NumSubElts / SubMask.size();
 | 
				
			||||||
 | 
					        SmallVector<int,64> ScaledSubMask;
 | 
				
			||||||
 | 
					        scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
 | 
				
			||||||
 | 
					        SubMask = ScaledSubMask;
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
					        int Scale = SubMask.size() / NumSubElts;
 | 
				
			||||||
 | 
					        NumSubElts = SubMask.size();
 | 
				
			||||||
 | 
					        NumElts *= Scale;
 | 
				
			||||||
 | 
					        InsertIdx *= Scale;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    Ops.push_back(Src);
 | 
					    Ops.push_back(Src);
 | 
				
			||||||
    for (SDValue &SubInput : SubInputs) {
 | 
					    for (SDValue &SubInput : SubInputs) {
 | 
				
			||||||
      if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
 | 
					      EVT SubSVT = SubInput.getValueType().getScalarType();
 | 
				
			||||||
          SubInput.getOperand(0).getValueType() != VT ||
 | 
					      EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
 | 
				
			||||||
          !isa<ConstantSDNode>(SubInput.getOperand(1)))
 | 
					                                   NumSizeInBits / SubSVT.getSizeInBits());
 | 
				
			||||||
        return false;
 | 
					      Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
 | 
				
			||||||
      Ops.push_back(SubInput.getOperand(0));
 | 
					                                DAG.getUNDEF(AltVT), SubInput,
 | 
				
			||||||
 | 
					                                DAG.getIntPtrConstant(0, SDLoc(N))));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    int InsertIdx = N.getConstantOperandVal(2);
 | 
					 | 
				
			||||||
    for (int i = 0; i != (int)NumElts; ++i)
 | 
					    for (int i = 0; i != (int)NumElts; ++i)
 | 
				
			||||||
      Mask.push_back(i);
 | 
					      Mask.push_back(i);
 | 
				
			||||||
    for (int i = 0; i != (int)NumSubElts; ++i) {
 | 
					    for (int i = 0; i != (int)NumSubElts; ++i) {
 | 
				
			||||||
      int M = SubMask[i];
 | 
					      int M = SubMask[i];
 | 
				
			||||||
      if (0 <= M) {
 | 
					      if (0 <= M) {
 | 
				
			||||||
        int InputIdx = M / NumSubElts;
 | 
					        int InputIdx = M / NumSubElts;
 | 
				
			||||||
        int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
 | 
					        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
 | 
				
			||||||
        M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      Mask[i + InsertIdx] = M;
 | 
					      Mask[i + InsertIdx] = M;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
| 
						 | 
					@ -6813,7 +6825,7 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
 | 
				
			||||||
static bool resolveTargetShuffleInputs(SDValue Op,
 | 
					static bool resolveTargetShuffleInputs(SDValue Op,
 | 
				
			||||||
                                       SmallVectorImpl<SDValue> &Inputs,
 | 
					                                       SmallVectorImpl<SDValue> &Inputs,
 | 
				
			||||||
                                       SmallVectorImpl<int> &Mask,
 | 
					                                       SmallVectorImpl<int> &Mask,
 | 
				
			||||||
                                       const SelectionDAG &DAG) {
 | 
					                                       SelectionDAG &DAG) {
 | 
				
			||||||
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
 | 
					  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
 | 
				
			||||||
    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
 | 
					    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
 | 
				
			||||||
      return false;
 | 
					      return false;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -536,9 +536,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i3
 | 
				
			||||||
define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
 | 
					define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
 | 
				
			||||||
; CHECK-LABEL: test_2xi32_to_16xi32_mem:
 | 
					; CHECK-LABEL: test_2xi32_to_16xi32_mem:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 | 
					 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -547,10 +546,10 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
 | 
				
			||||||
define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0:
 | 
					; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm2, %zmm2
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -562,10 +561,10 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i
 | 
				
			||||||
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0:
 | 
					; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm1, %zmm1
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -576,10 +575,10 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x
 | 
				
			||||||
define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1:
 | 
					; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm2, %zmm2
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -591,10 +590,10 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i
 | 
				
			||||||
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1:
 | 
					; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm1, %zmm1
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -605,10 +604,10 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x
 | 
				
			||||||
define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2:
 | 
					; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm2, %zmm2
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -620,10 +619,10 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i
 | 
				
			||||||
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2:
 | 
					; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm1, %zmm1
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -634,10 +633,10 @@ define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x
 | 
				
			||||||
define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3:
 | 
					; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm2, %zmm2
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm1, %zmm1, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm2, %zmm3, %zmm0 {%k1}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					@ -649,10 +648,10 @@ define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i
 | 
				
			||||||
define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
					define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3:
 | 
					; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 | 
					; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 | 
				
			||||||
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; CHECK-NEXT:    vpbroadcastq %xmm1, %zmm1
 | 
				
			||||||
; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
					; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
 | 
					; CHECK-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3852,11 +3852,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 | 
				
			||||||
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 | 
					; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 | 
					; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 | 
				
			||||||
; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
 | 
					; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,5,5]
 | 
				
			||||||
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 | 
					; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
 | 
				
			||||||
; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 | 
					; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 | 
				
			||||||
; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
 | 
					; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
 | 
				
			||||||
; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 | 
					; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
 | 
					  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
 | 
				
			||||||
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
 | 
					  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
 | 
				
			||||||
| 
						 | 
					@ -3867,11 +3867,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 | 
				
			||||||
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 | 
					define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 | 
				
			||||||
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 | 
					; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 | 
				
			||||||
; CHECK:       # %bb.0:
 | 
					; CHECK:       # %bb.0:
 | 
				
			||||||
; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 | 
					; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
 | 
				
			||||||
; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 | 
					; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,1,5,5]
 | 
				
			||||||
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 | 
					; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 | 
				
			||||||
; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
 | 
					; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
 | 
				
			||||||
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
 | 
					; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
 | 
				
			||||||
 | 
					; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 | 
				
			||||||
; CHECK-NEXT:    retq
 | 
					; CHECK-NEXT:    retq
 | 
				
			||||||
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
 | 
					  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
 | 
				
			||||||
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
 | 
					  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1185,7 +1185,8 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
 | 
					; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
 | 
					; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
 | 
					; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm3
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
 | 
					; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
 | 
				
			||||||
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
 | 
					; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
 | 
				
			||||||
| 
						 | 
					@ -1404,7 +1405,8 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
 | 
					; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
 | 
					; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm4
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
 | 
					; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
 | 
					; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
 | 
				
			||||||
| 
						 | 
					@ -1431,7 +1433,8 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
 | 
					; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
 | 
					; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
 | 
					; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm4
 | 
				
			||||||
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 | 
					; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
 | 
				
			||||||
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm3
 | 
					; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm3
 | 
				
			||||||
; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
 | 
					; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1598,9 +1598,8 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
 | 
				
			||||||
; X32-AVX512-LABEL: test_2xi32_to_16xi32_mem:
 | 
					; X32-AVX512-LABEL: test_2xi32_to_16xi32_mem:
 | 
				
			||||||
; X32-AVX512:       # %bb.0:
 | 
					; X32-AVX512:       # %bb.0:
 | 
				
			||||||
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 | 
					; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 | 
				
			||||||
; X32-AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 | 
					; X32-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 | 
				
			||||||
; X32-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; X32-AVX512-NEXT:    vbroadcastsd %xmm0, %zmm0
 | 
				
			||||||
; X32-AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 | 
					 | 
				
			||||||
; X32-AVX512-NEXT:    retl
 | 
					; X32-AVX512-NEXT:    retl
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
 | 
					; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
 | 
				
			||||||
| 
						 | 
					@ -1611,9 +1610,8 @@ define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
 | 
					; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
 | 
				
			||||||
; X64-AVX512:       # %bb.0:
 | 
					; X64-AVX512:       # %bb.0:
 | 
				
			||||||
; X64-AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 | 
					; X64-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 | 
				
			||||||
; X64-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; X64-AVX512-NEXT:    vbroadcastsd %xmm0, %zmm0
 | 
				
			||||||
; X64-AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 | 
					 | 
				
			||||||
; X64-AVX512-NEXT:    retq
 | 
					; X64-AVX512-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2111,9 +2111,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2210,9 +2209,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2253,9 +2251,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2490,9 +2487,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2543,9 +2539,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2596,9 +2591,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2648,9 +2642,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2700,9 +2693,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3073,9 +3065,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3135,9 +3126,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3197,9 +3187,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3267,9 +3256,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3301,66 +3289,65 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL:       # %bb.0:
 | 
					; AVX512DQVL:       # %bb.0:
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
 | 
				
			||||||
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 | 
					; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm5, %ymm5
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm4, %ymm4
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 | 
					; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 | 
					; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm2
 | 
					; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
 | 
					; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 | 
					; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 | 
				
			||||||
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 | 
					; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2066,9 +2066,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2165,9 +2164,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2208,9 +2206,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2445,9 +2442,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2498,9 +2494,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2551,9 +2546,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2603,9 +2597,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -2655,9 +2648,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3028,9 +3020,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3090,9 +3081,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BW-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3152,9 +3142,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
					; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
 | 
					 | 
				
			||||||
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
					; AVX512BWVL-NEXT:    vpandq %zmm3, %zmm0, %zmm0
 | 
				
			||||||
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3222,9 +3211,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
| 
						 | 
					@ -3256,66 +3244,65 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 | 
				
			||||||
; AVX512DQVL:       # %bb.0:
 | 
					; AVX512DQVL:       # %bb.0:
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
 | 
				
			||||||
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 | 
					; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm5, %ymm5
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm4, %ymm4
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm5, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 | 
					; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm2, %ymm2
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm2, %ymm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 | 
					; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
 | 
					 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
					; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 | 
					; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm2
 | 
					; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
 | 
					; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm2
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
					; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpand %ymm4, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpand %ymm5, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
					; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
 | 
				
			||||||
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 | 
					; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 | 
				
			||||||
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 | 
					; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -40,8 +40,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
| 
						 | 
					@ -53,8 +54,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -79,8 +80,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
| 
						 | 
					@ -92,8 +94,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -118,8 +120,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
| 
						 | 
					@ -131,8 +134,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -147,11 +150,33 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -165,11 +190,33 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -183,11 +230,33 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -201,11 +270,33 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -685,8 +776,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
| 
						 | 
					@ -698,8 +790,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -722,8 +814,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
| 
						 | 
					@ -735,8 +828,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -759,8 +852,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
; AVX2-FAST:       # %bb.0:
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
 | 
				
			||||||
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX2-FAST-NEXT:    retq
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
| 
						 | 
					@ -772,8 +866,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
| 
						 | 
					@ -786,11 +880,33 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -802,11 +918,33 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -818,11 +956,33 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -834,11 +994,33 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 | 
				
			||||||
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
					; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
					; AVX2-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
; AVX2OR512VL:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
 | 
				
			||||||
; AVX2OR512VL-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 | 
				
			||||||
 | 
					; AVX512VL-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX512VL-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
 | 
				
			||||||
 | 
					; AVX512VL-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX512VL-FAST-NEXT:    retq
 | 
				
			||||||
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
					  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 | 
				
			||||||
  ret <16 x i16> %shuffle
 | 
					  ret <16 x i16> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -4490,17 +4672,31 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
 | 
				
			||||||
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
 | 
					; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; AVX1-NEXT:    retq
 | 
					; AVX1-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX2-LABEL: PR34369:
 | 
					; AVX2-SLOW-LABEL: PR34369:
 | 
				
			||||||
; AVX2:       # %bb.0:
 | 
					; AVX2-SLOW:       # %bb.0:
 | 
				
			||||||
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 | 
					; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 | 
				
			||||||
; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
 | 
					; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
 | 
				
			||||||
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
 | 
					; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
 | 
				
			||||||
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
 | 
					; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
 | 
				
			||||||
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 | 
					; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
 | 
				
			||||||
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
					; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
 | 
				
			||||||
; AVX2-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
 | 
					; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 | 
				
			||||||
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
 | 
					; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-SLOW-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    vpand %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX2-SLOW-NEXT:    retq
 | 
				
			||||||
 | 
					;
 | 
				
			||||||
 | 
					; AVX2-FAST-LABEL: PR34369:
 | 
				
			||||||
 | 
					; AVX2-FAST:       # %bb.0:
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm1
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    vpand %ymm0, %ymm1, %ymm0
 | 
				
			||||||
 | 
					; AVX2-FAST-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-LABEL: PR34369:
 | 
					; AVX512VL-LABEL: PR34369:
 | 
				
			||||||
; AVX512VL:       # %bb.0:
 | 
					; AVX512VL:       # %bb.0:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1553,19 +1553,12 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 | 
				
			||||||
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 | 
					; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v8i32_08991abb:
 | 
					; AVX512VL-LABEL: shuffle_v8i32_08991abb:
 | 
				
			||||||
; AVX512VL-SLOW:       # %bb.0:
 | 
					; AVX512VL:       # %bb.0:
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
 | 
					; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3]
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
 | 
					; AVX512VL-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
 | 
					; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    retq
 | 
					; AVX512VL-NEXT:    retq
 | 
				
			||||||
;
 | 
					 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v8i32_08991abb:
 | 
					 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,0,1,1,9,2,3,3]
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vmovdqa %ymm2, %ymm0
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					 | 
				
			||||||
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
 | 
					  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
 | 
				
			||||||
  ret <8 x i32> %shuffle
 | 
					  ret <8 x i32> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -1603,19 +1596,12 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
 | 
				
			||||||
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 | 
					; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 | 
				
			||||||
; AVX2-NEXT:    retq
 | 
					; AVX2-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def:
 | 
					; AVX512VL-LABEL: shuffle_v8i32_09ab1def:
 | 
				
			||||||
; AVX512VL-SLOW:       # %bb.0:
 | 
					; AVX512VL:       # %bb.0:
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 | 
					; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7]
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
 | 
					; AVX512VL-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
 | 
					; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
 | 
				
			||||||
; AVX512VL-SLOW-NEXT:    retq
 | 
					; AVX512VL-NEXT:    retq
 | 
				
			||||||
;
 | 
					 | 
				
			||||||
; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def:
 | 
					 | 
				
			||||||
; AVX512VL-FAST:       # %bb.0:
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,2,3,9,5,6,7]
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    vmovdqa %ymm2, %ymm0
 | 
					 | 
				
			||||||
; AVX512VL-FAST-NEXT:    retq
 | 
					 | 
				
			||||||
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
 | 
					  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
 | 
				
			||||||
  ret <8 x i32> %shuffle
 | 
					  ret <8 x i32> %shuffle
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -541,8 +541,9 @@ define <8 x float> @expand14(<4 x float> %a) {
 | 
				
			||||||
define <8 x float> @expand15(<4 x float> %a) {
 | 
					define <8 x float> @expand15(<4 x float> %a) {
 | 
				
			||||||
; SKX64-LABEL: expand15:
 | 
					; SKX64-LABEL: expand15:
 | 
				
			||||||
; SKX64:       # %bb.0:
 | 
					; SKX64:       # %bb.0:
 | 
				
			||||||
; SKX64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 | 
					; SKX64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 | 
				
			||||||
; SKX64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
 | 
					; SKX64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,1,0,1,1,3,u,u>
 | 
				
			||||||
 | 
					; SKX64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; SKX64-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
 | 
					; SKX64-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
 | 
				
			||||||
; SKX64-NEXT:    retq
 | 
					; SKX64-NEXT:    retq
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
| 
						 | 
					@ -555,8 +556,9 @@ define <8 x float> @expand15(<4 x float> %a) {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; SKX32-LABEL: expand15:
 | 
					; SKX32-LABEL: expand15:
 | 
				
			||||||
; SKX32:       # %bb.0:
 | 
					; SKX32:       # %bb.0:
 | 
				
			||||||
; SKX32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 | 
					; SKX32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 | 
				
			||||||
; SKX32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
 | 
					; SKX32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,1,0,1,1,3,u,u>
 | 
				
			||||||
 | 
					; SKX32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 | 
				
			||||||
; SKX32-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
 | 
					; SKX32-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
 | 
				
			||||||
; SKX32-NEXT:    retl
 | 
					; SKX32-NEXT:    retl
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -630,9 +630,8 @@ define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
 | 
				
			||||||
;
 | 
					;
 | 
				
			||||||
; AVX512-LABEL: load_splat_16i32_2i32_0101:
 | 
					; AVX512-LABEL: load_splat_16i32_2i32_0101:
 | 
				
			||||||
; AVX512:       # %bb.0:
 | 
					; AVX512:       # %bb.0:
 | 
				
			||||||
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 | 
					; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 | 
				
			||||||
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 | 
					; AVX512-NEXT:    vbroadcastsd %xmm0, %zmm0
 | 
				
			||||||
; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
 | 
					 | 
				
			||||||
; AVX512-NEXT:    retq
 | 
					; AVX512-NEXT:    retq
 | 
				
			||||||
  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
					  %vec = load <2 x i32>, <2 x i32>* %vp
 | 
				
			||||||
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
					  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue