[X86] Fold concat(pshufb(x,y),pshufb(z,w)) -> pshufb(concat(x,z),concat(y,w))
This commit is contained in:
parent
f67e3f6e8c
commit
30a01bccda
|
@ -53329,6 +53329,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
|
|||
DAG.getTargetConstant(Idx, DL, MVT::i8));
|
||||
}
|
||||
break;
|
||||
case X86ISD::PSHUFB:
|
||||
if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
|
||||
(VT.is512BitVector() && Subtarget.useBWIRegs()))) {
|
||||
return DAG.getNode(Op0.getOpcode(), DL, VT,
|
||||
ConcatSubOperand(VT, Ops, 0),
|
||||
ConcatSubOperand(VT, Ops, 1));
|
||||
}
|
||||
break;
|
||||
case X86ISD::VPERMV3:
|
||||
if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
|
||||
MVT OpVT = Op0.getSimpleValueType();
|
||||
|
@ -53464,6 +53472,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
|
|||
}
|
||||
}
|
||||
|
||||
// Attempt to fold target constant loads.
|
||||
if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
|
||||
SmallVector<APInt> EltBits;
|
||||
APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
|
||||
for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
|
||||
APInt OpUndefElts;
|
||||
SmallVector<APInt> OpEltBits;
|
||||
if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
|
||||
OpEltBits, true, false))
|
||||
break;
|
||||
EltBits.append(OpEltBits);
|
||||
UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
|
||||
}
|
||||
if (EltBits.size() == VT.getVectorNumElements())
|
||||
return getConstVector(EltBits, UndefElts, VT, DAG, DL);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
|
|
@ -314,21 +314,23 @@ define void @store_i8_stride3_vf16(<16 x i8>* %in.vecptr0, <16 x i8>* %in.vecptr
|
|||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
|
||||
; AVX512-NEXT: vmovdqa (%rdx), %xmm2
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
|
||||
; AVX512-NEXT: vmovdqa %xmm0, 16(%rcx)
|
||||
; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
|
||||
; AVX512-NEXT: vmovdqa %xmm2, 32(%rcx)
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
|
||||
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
|
||||
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vmovdqa %xmm0, 32(%rcx)
|
||||
; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
%in.vec0 = load <16 x i8>, <16 x i8>* %in.vecptr0, align 32
|
||||
%in.vec1 = load <16 x i8>, <16 x i8>* %in.vecptr1, align 32
|
||||
|
@ -544,13 +546,13 @@ define void @store_i8_stride3_vf32(<32 x i8>* %in.vecptr0, <32 x i8>* %in.vecptr
|
|||
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vmovdqa %ymm0, 64(%rcx)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm1, (%rcx)
|
||||
; AVX512-NEXT: vzeroupper
|
||||
|
|
|
@ -1493,13 +1493,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
|
|||
; AVX512F-LABEL: PR54562_ref:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
|
||||
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
||||
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm3
|
||||
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
|
||||
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
||||
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
|
@ -1513,13 +1513,13 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
|
|||
; AVX512DQ-LABEL: PR54562_ref:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
|
||||
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
|
||||
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
|
||||
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
|
||||
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
|
@ -1538,13 +1538,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
|
|||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0
|
||||
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
|
||||
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
|
||||
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
|
||||
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512F-NEXT: vmovdqa %xmm0, 48(%rsi)
|
||||
; AVX512F-NEXT: vmovdqa %xmm1, 32(%rsi)
|
||||
; AVX512F-NEXT: vmovdqa %ymm2, (%rsi)
|
||||
; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm1
|
||||
; AVX512F-NEXT: vmovdqa %ymm1, (%rsi)
|
||||
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
|
@ -1561,13 +1561,13 @@ define void @PR54562_mem(<64 x i8>* %src, <64 x i8>* %dst) {
|
|||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
|
||||
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10]
|
||||
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
|
||||
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,21,20,22,21,24,23,25,24,27,26,28,27,30,29,31,30]
|
||||
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
|
||||
; AVX512DQ-NEXT: vmovdqa %xmm0, 48(%rsi)
|
||||
; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rsi)
|
||||
; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi)
|
||||
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
|
||||
; AVX512DQ-NEXT: vmovdqa %ymm1, (%rsi)
|
||||
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rsi)
|
||||
; AVX512DQ-NEXT: vzeroupper
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
|
|
|
@ -972,24 +972,47 @@ ret void
|
|||
}
|
||||
|
||||
define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
|
||||
; AVX-LABEL: interleaved_store_vf16_i8_stride3:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
|
||||
; AVX-NEXT: vmovdqu %xmm0, 16(%rdi)
|
||||
; AVX-NEXT: vmovdqu %xmm1, (%rdi)
|
||||
; AVX-NEXT: vmovdqu %xmm2, 32(%rdi)
|
||||
; AVX-NEXT: retq
|
||||
; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
|
||||
; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
|
||||
; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi)
|
||||
; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi)
|
||||
; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi)
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
|
||||
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
|
||||
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
|
||||
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vmovdqu %xmm0, 32(%rdi)
|
||||
; AVX512-NEXT: vmovdqu %ymm1, (%rdi)
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
|
||||
%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
|
||||
|
@ -1069,13 +1092,13 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
|
|||
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
|
||||
; AVX512-NEXT: vpshufb %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi)
|
||||
; AVX512-NEXT: vzeroupper
|
||||
|
@ -1209,26 +1232,25 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
|
|||
; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
|
||||
; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm4[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm4[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm4[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm4[48,49,50,51,52]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm2[2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm2[2,3]
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
|
||||
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
|
||||
; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm2
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2
|
||||
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
|
||||
; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4
|
||||
; AVX512-NEXT: vpshufb %zmm4, %zmm2, %zmm2
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
|
||||
; AVX512-NEXT: vpshufb %zmm4, %zmm5, %zmm4
|
||||
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
|
||||
; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
|
||||
; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi)
|
||||
; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi)
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
|
|
Loading…
Reference in New Issue