[X86][AVX] createPSADBW - support 256-bit cases on AVX1 via SplitBinaryOpsAndApply
llvm-svn: 326104
parent a2fbcef8ee
commit db0ed7d724
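In short: combineBasicSADPattern and combineLoopSADPattern previously required AVX2 before selecting a 256-bit register size, so plain AVX1 targets fell back to the generic zext/sub/abs/add expansion. AVX1 does have the 128-bit PSADBW, though, so createPSADBW now takes the subtarget and emits its node through SplitBinaryOpsAndApply, which lowers the 256-bit case as two 128-bit PSADBWs.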
llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -31029,8 +31029,8 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
 // to these zexts.
 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
-                            const SDValue &Zext1, const SDLoc &DL) {
+                            const SDValue &Zext1, const SDLoc &DL,
+                            const X86Subtarget &Subtarget) {
   // Find the appropriate width for the PSADBW.
   EVT InVT = Zext0.getOperand(0).getValueType();
   unsigned RegSize = std::max(128u, InVT.getSizeInBits());
@@ -31045,9 +31045,15 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
   Ops[0] = Zext1.getOperand(0);
   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
 
-  // Actually build the SAD
+  // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+  auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                          SDValue Op1) {
+    MVT VT = MVT::getVectorVT(MVT::i64, Op0.getValueSizeInBits() / 64);
+    return DAG.getNode(X86ISD::PSADBW, DL, VT, Op0, Op1);
+  };
   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
-  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
+  return SplitBinaryOpsAndApply(DAG, Subtarget, DL, SadVT, SadOp0, SadOp1,
+                                PSADBWBuilder);
 }
 
 // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
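Note how the PSADBWBuilder lambda derives its result type from the operand width rather than from SadVT: when the helper splits a 256-bit request, each half builds a v2i64 PSADBW instead of the full v4i64. Below is a minimal sketch of the splitting behaviour being relied on here; it is an assumption about the existing SplitBinaryOpsAndApply helper (the name, extraction details, and omission of the 512-bit case are illustrative), not its actual implementation:

static SDValue SplitBinaryOpsAndApplySketch(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, MVT VT,
    SDValue Op0, SDValue Op1,
    function_ref<SDValue(SelectionDAG &, const SDLoc &, SDValue, SDValue)>
        Builder) {
  MVT OpVT = Op0.getSimpleValueType();
  // AVX1 has 256-bit vector registers but only 128-bit integer ops, so a
  // 256-bit PSADBW request must be emitted as two 128-bit PSADBWs.
  if (OpVT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned HalfElts = OpVT.getVectorNumElements() / 2;
    MVT HalfVT = MVT::getVectorVT(OpVT.getVectorElementType(), HalfElts);
    SDValue Lo0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op0,
                              DAG.getIntPtrConstant(0, DL));
    SDValue Hi0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op0,
                              DAG.getIntPtrConstant(HalfElts, DL));
    SDValue Lo1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op1,
                              DAG.getIntPtrConstant(0, DL));
    SDValue Hi1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op1,
                              DAG.getIntPtrConstant(HalfElts, DL));
    // Each half-width call produces half of the i64 result lanes; the concat
    // rebuilds the requested 256-bit result (v4i64 for v32i8 inputs).
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
                       Builder(DAG, DL, Lo0, Lo1),
                       Builder(DAG, DL, Hi0, Hi1));
  }
  // Native width (128-bit SSE, 256-bit AVX2, 512-bit AVX512BW): emit directly.
  return Builder(DAG, DL, Op0, Op1);
}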
@@ -31216,10 +31222,10 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   unsigned RegSize = 128;
   if (Subtarget.useBWIRegs())
     RegSize = 512;
-  else if (Subtarget.hasAVX2())
+  else if (Subtarget.hasAVX())
     RegSize = 256;
 
-  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
+  // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
   if (RegSize / VT.getVectorNumElements() < 8)
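Worked example for the gate above: the sad_32i8 test operates on v32i8, so with this change an AVX1 target gets RegSize = 256 and RegSize / NumElts = 256 / 32 = 8, which passes; previously AVX1 stayed at RegSize = 128, giving 128 / 32 = 4 < 8, and the combine bailed out to the generic zext/sub/abs expansion seen in the old test checks below.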
@@ -31254,7 +31260,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
 
   // Create the SAD instruction.
   SDLoc DL(Extract);
-  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
+  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
 
   // If the original vector was wider than 8 elements, sum over the results
   // in the SAD vector.
@@ -37404,10 +37410,10 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   unsigned RegSize = 128;
   if (Subtarget.useBWIRegs())
     RegSize = 512;
-  else if (Subtarget.hasAVX2())
+  else if (Subtarget.hasAVX())
     RegSize = 256;
 
-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
   if (VT.getSizeInBits() / 4 > RegSize)
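The same arithmetic applies to this loop-SAD gate: a v32i32 accumulator is 1024 bits, and 1024 / 4 = 256 no longer exceeds RegSize = 256 on AVX1, so the loop pattern is now accepted up to v32i32 there as well.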
@@ -37433,7 +37439,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
   // reduction. Note that the number of elements of the result of SAD is less
   // than the number of elements of its input. Therefore, we could only update
   // part of elements in the reduction vector.
-  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
+  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
 
   // The output of PSADBW is a vector of i64.
   // We need to turn the vector of i64 into a vector of i32.
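A sketch of the step the trailing comment describes, as an assumption about the surrounding (unchanged) code rather than a quote of it: the i64 lanes produced by PSADBW are reinterpreted as i32 lanes with a bitcast so they can be folded back into the i32 reduction vector.

// Hypothetical shape of the i64 -> i32 reinterpretation; Sad and DL follow
// the code above, ResVT is illustrative.
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);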
llvm/test/CodeGen/X86/sad.ll

@@ -286,76 +286,36 @@ define i32 @sad_32i8() nounwind {
 ;
 ; AVX1-LABEL: sad_32i8:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
+; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; AVX1-NEXT:    vpxor %xmm10, %xmm10, %xmm10
-; AVX1-NEXT:    vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB1_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm11, %xmm7
-; AVX1-NEXT:    vpabsd %xmm3, %xmm11
-; AVX1-NEXT:    vpabsd %xmm4, %xmm4
-; AVX1-NEXT:    vpabsd %xmm5, %xmm5
-; AVX1-NEXT:    vpabsd %xmm6, %xmm6
-; AVX1-NEXT:    vpabsd %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd %xmm2, %xmm2
-; AVX1-NEXT:    vpabsd %xmm7, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpaddd %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm9
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm10, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm10
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddd %xmm8, %xmm5, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddd %xmm12, %xmm11, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm12
+; AVX1-NEXT:    vmovdqa a+1024(%rax), %ymm2
+; AVX1-NEXT:    vmovdqa b+1024(%rax), %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $4, %rax
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vpaddd %xmm12, %xmm10, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm4
-; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
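The effect on the AVX1 loop body is visible above: eight vpmovzxbd loads plus vpsubd/vpabsd chains per iteration collapse into two 32-byte vmovdqa loads, a vextractf128 per side, and two vpsadbw ops, while the four zeroed accumulators (ymm8/ymm9/ymm10/ymm12) shrink to two (ymm0/ymm1).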
@@ -1381,50 +1341,17 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ;
 ; AVX1-LABEL: sad_nonloop_32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpabsd %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd %xmm2, %xmm2
-; AVX1-NEXT:    vpabsd %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpabsd %xmm4, %xmm1
-; AVX1-NEXT:    vpabsd %xmm5, %xmm2
-; AVX1-NEXT:    vpabsd %xmm6, %xmm3
-; AVX1-NEXT:    vpabsd %xmm7, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdx), %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sad_nonloop_32i8:
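The non-loop case shows the SplitBinaryOpsAndApply shape most directly: one vpsadbw on the low xmm halves, one on the high halves, and a vpaddq to combine them. The horizontal reduction then proceeds on i64 lanes (hence vpaddq rather than the old vpaddd), and vzeroupper is now required because the lowering touches ymm registers.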