[X86] Turn v16i16->v16i8 truncate+store into an any_extend+truncstore if we have avx512f, but not avx512bw.

Ideally we'd be able to represent this truncate as an any_extend to v16i32 and a truncate, but SelectionDAG doesn't know how to not fold those together. We have isel patterns to use vpmovzxwd+vpmovdb for the truncate, but we aren't able to simultaneously fold the load and the store from the isel pattern. By pulling the truncate into the store we can successfully hide it from the DAG combiner. Then we can isel pattern match the truncstore and load+any_extend separately.

llvm-svn: 364163
parent c6094f0495
commit e8da65c698
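Before the diffs, a scalar model of why the rewrite is safe (illustrative C++, not code from this commit): an any_extend only introduces undefined bits above the original width, and a truncating store never reads them, so storing trunc(x) and truncating-storing any_extend(x) write the same byte in every lane.

    #include <cassert>
    #include <cstdint>

    // One 16-bit lane. Old form: trunc i16 -> i8, then a plain store.
    static uint8_t store_trunc(uint16_t Lane) {
      return static_cast<uint8_t>(Lane); // stored byte = bits 0..7
    }

    // New form: any_extend i16 -> i32 (upper bits undefined), then a
    // truncating store to i8. `Garbage` models the undefined upper bits.
    static uint8_t truncstore_anyext(uint16_t Lane, uint16_t Garbage) {
      uint32_t Ext = (static_cast<uint32_t>(Garbage) << 16) | Lane; // any_extend
      return static_cast<uint8_t>(Ext); // truncstore keeps only bits 0..7
    }

    int main() {
      for (uint32_t V = 0; V <= 0xFFFF; ++V)
        assert(store_trunc(static_cast<uint16_t>(V)) ==
               truncstore_anyext(static_cast<uint16_t>(V), 0xDEAD));
      return 0;
    }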
llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -39656,6 +39656,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
   StoreSDNode *St = cast<StoreSDNode>(N);
   EVT VT = St->getValue().getValueType();
@@ -39767,6 +39768,18 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
+  // supported, but avx512f is by extending to v16i32 and truncating.
+  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
+      St->getValue().getOpcode() == ISD::TRUNCATE &&
+      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
+      TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
+      !DCI.isBeforeLegalizeOps()) {
+    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
+                             MVT::v16i8, St->getMemOperand());
+  }
+
   // Optimize trunc store (of multiple scalars) to shuffle and store.
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.
@@ -43774,7 +43787,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::BEXTR:       return combineBEXTR(N, DAG, DCI, Subtarget);
   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
-  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
+  case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
   case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
llvm/lib/Target/X86/X86InstrAVX512.td

@@ -9799,8 +9799,6 @@ def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
          (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
 def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
          (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
-def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
-         (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
 }
 
 //===----------------------------------------------------------------------===//
llvm/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll

@@ -88,20 +88,18 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll

@@ -88,20 +88,18 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512F-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
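The payoff in the two tests above is the loss of the standalone vmovdqa: once the truncate is hidden inside a truncstore, isel can fold the load into vpmovzxwd and the store into vpmovdb independently. Roughly, the new per-half sequence corresponds to the following intrinsics (hypothetical helper written for this note, assuming AVX512F; compile with -mavx512f):

    #include <immintrin.h>

    // Illustrative rendering of the new codegen for one 16-element half;
    // the function name and signature are made up for this sketch.
    void trunc_half(const unsigned short *Src, unsigned char *Dst) {
      // vpmovzxwd with a folded memory operand: v16i16 load extended to v16i32.
      __m512i Wide =
          _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)Src));
      // vpmovdb with a folded store: truncate v16i32 -> v16i8 straight to memory.
      _mm512_mask_cvtepi32_storeu_epi8(Dst, 0xFFFF, Wide);
    }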
llvm/test/CodeGen/X86/vector-trunc-widen.ll

@@ -1148,7 +1148,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ;
 ; AVX512F-LABEL: trunc16i16_16i8_ashr:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
@@ -1156,7 +1156,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ;
 ; AVX512VL-LABEL: trunc16i16_16i8_ashr:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
llvm/test/CodeGen/X86/vector-trunc.ll

@@ -1158,7 +1158,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ;
 ; AVX512F-LABEL: trunc16i16_16i8_ashr:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512F-NEXT:    vzeroupper
@@ -1166,7 +1166,7 @@ define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
 ;
 ; AVX512VL-LABEL: trunc16i16_16i8_ashr:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
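The vpsraw -> vpsrlw flip in both vector-trunc files is a legitimate side effect: a truncating byte store demands only the low 8 bits of each 16-bit lane, and under that demand an arithmetic right shift by 8 is indistinguishable from a logical one, so the combiner is free to canonicalize sra to srl. A quick scalar check of the identity (illustrative only, not from the commit; assumes two's-complement narrowing, which every mainstream compiler provides):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t V = 0; V <= 0xFFFF; ++V) {
        uint16_t X = static_cast<uint16_t>(V);
        // trunc-to-i8 of (X ashr 8): bits 8..15 of X.
        uint8_t Ashr = static_cast<uint8_t>(static_cast<int16_t>(X) >> 8);
        // trunc-to-i8 of (X lshr 8): also bits 8..15 of X.
        uint8_t Lshr = static_cast<uint8_t>(X >> 8);
        assert(Ashr == Lshr); // only the demanded low byte matters
      }
      return 0;
    }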