[x86] vectorize cast ops in lowering to avoid register file transfers

The proposal in D56796 may cross the line because we're trying to avoid vectorization 
transforms in generic DAG combining. So this is an alternate, later, x86-specific 
translation of that patch.

There are several potential follow-ups to enhance this:
1. Allow extraction from non-zero element index.
2. Peek through extends of smaller width integers.
3. Support x86-specific conversion opcodes like X86ISD::CVTSI2P.

Differential Revision: https://reviews.llvm.org/D56864

llvm-svn: 353302
This commit is contained in:
Sanjay Patel 2019-02-06 14:59:39 +00:00
parent 02974728dc
commit e84fbb67a1
4 changed files with 127 additions and 41 deletions

View File

@ -17540,6 +17540,57 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl)); DAG.getIntPtrConstant(0, dl));
} }
/// Return true if the target supports a vector form of the given int-to-FP
/// cast \p Opcode converting \p FromVT to \p ToVT, so that a scalar cast of
/// an extracted element can instead be done as a vector cast + extract.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                          const X86Subtarget &Subtarget) {
  if (Opcode == ISD::SINT_TO_FP) {
    // TODO: Handle wider types with AVX/AVX512.
    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
      return false;
    // CVTDQ2PS or (V)CVTDQ2PD
    if (ToVT == MVT::v4f32)
      return true;
    return Subtarget.hasAVX() && ToVT == MVT::v4f64;
  }
  if (Opcode == ISD::UINT_TO_FP) {
    // TODO: Handle wider types and i64 elements.
    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
      return false;
    // VCVTUDQ2PS or VCVTUDQ2PD
    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
  }
  return false;
}
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: The limitation for extracting from the 0-element is not required,
  // but if we extract from some other element, it will require shuffling to
  // get the result into the right place.
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  MVT DestVT = Cast.getSimpleValueType();
  SDValue Ext = Cast.getOperand(0);
  if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isNullConstant(Ext.getOperand(1)))
    return SDValue();

  // The source vector must be castable with a supported vector conversion.
  SDValue SrcVec = Ext.getOperand(0);
  MVT FromVT = SrcVec.getSimpleValueType();
  MVT ToVT = MVT::getVectorVT(DestVT, FromVT.getVectorNumElements());
  if (!useVectorCast(Cast.getOpcode(), FromVT, ToVT, Subtarget))
    return SDValue();

  // cast (extract V, Y) --> extract (cast V), Y
  SDLoc DL(Cast);
  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, SrcVec);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
                     Ext.getOperand(1));
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const { SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0); SDValue Src = Op.getOperand(0);
@ -17547,6 +17598,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType(); MVT VT = Op.getSimpleValueType();
SDLoc dl(Op); SDLoc dl(Op);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) { if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
@ -17909,6 +17963,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (Op.getSimpleValueType().isVector()) if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
MVT SrcVT = N0.getSimpleValueType(); MVT SrcVT = N0.getSimpleValueType();
MVT DstVT = Op.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType();

View File

@ -25,8 +25,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
; X32: # %bb.0: ; X32: # %bb.0:
; X32-NEXT: pushl %eax ; X32-NEXT: pushl %eax
; X32-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X32-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax

View File

@ -92,8 +92,7 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0] ; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax
@ -120,8 +119,7 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vpsllq $20, %xmm0, %xmm0 ; X32-NEXT: vpsllq $20, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax
@ -152,8 +150,7 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT: vpsrlq $3, %xmm0, %xmm0 ; X32-NEXT: vpsrlq $3, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax
@ -239,8 +236,7 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax
@ -283,8 +279,7 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
; X32-NEXT: vpand %xmm1, %xmm0, %xmm2 ; X32-NEXT: vpand %xmm1, %xmm0, %xmm2
; X32-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0
; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp) ; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp) ; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax ; X32-NEXT: popl %eax

View File

@ -5556,15 +5556,12 @@ define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind { define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32: ; SSE-LABEL: extract0_sitofp_v4i32_f32:
; SSE: # %bb.0: ; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssl %eax, %xmm0
; SSE-NEXT: retq ; SSE-NEXT: retq
; ;
; AVX-LABEL: extract0_sitofp_v4i32_f32: ; AVX-LABEL: extract0_sitofp_v4i32_f32:
; AVX: # %bb.0: ; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
; AVX-NEXT: retq ; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0 %e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to float %r = sitofp i32 %e to float
@ -5575,8 +5572,7 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: ; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; SSE: # %bb.0: ; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssl %eax, %xmm0
; SSE-NEXT: incl %eax ; SSE-NEXT: incl %eax
; SSE-NEXT: cvtsi2ssl %eax, %xmm1 ; SSE-NEXT: cvtsi2ssl %eax, %xmm1
; SSE-NEXT: divss %xmm1, %xmm0 ; SSE-NEXT: divss %xmm1, %xmm0
@ -5585,7 +5581,7 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: ; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
; AVX: # %bb.0: ; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: incl %eax ; AVX-NEXT: incl %eax
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1 ; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
@ -5601,17 +5597,15 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind { define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2: ; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; SSE: # %bb.0: ; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
; SSE-NEXT: cvtsi2ssl %eax, %xmm1 ; SSE-NEXT: movss %xmm0, (%rdi)
; SSE-NEXT: movd %xmm0, (%rdi)
; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq ; SSE-NEXT: retq
; ;
; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: ; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
; AVX: # %bb.0: ; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1
; AVX-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm1 ; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: vmovaps %xmm1, %xmm0 ; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq ; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0 %e = extractelement <4 x i32> %x, i32 0
@ -5630,8 +5624,7 @@ define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
; ;
; AVX-LABEL: extract0_sitofp_v4i32_f64: ; AVX-LABEL: extract0_sitofp_v4i32_f64:
; AVX: # %bb.0: ; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: vcvtsi2sdl %eax, %xmm1, %xmm0
; AVX-NEXT: retq ; AVX-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0 %e = extractelement <4 x i32> %x, i32 0
%r = sitofp i32 %e to double %r = sitofp i32 %e to double
@ -5652,11 +5645,31 @@ define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm0
; VEX-NEXT: retq ; VEX-NEXT: retq
; ;
; AVX512-LABEL: extract0_uitofp_v4i32_f32: ; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
; AVX512: # %bb.0: ; AVX512F: # %bb.0:
; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT: vcvtusi2ssl %eax, %xmm1, %xmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512-NEXT: retq ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0 %e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to float %r = uitofp i32 %e to float
ret float %r ret float %r
@ -5676,11 +5689,35 @@ define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0 ; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm0
; VEX-NEXT: retq ; VEX-NEXT: retq
; ;
; AVX512-LABEL: extract0_uitofp_v4i32_f64: ; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
; AVX512: # %bb.0: ; AVX512F: # %bb.0:
; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: vcvtusi2sdl %eax, %xmm1, %xmm0 ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512-NEXT: retq ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%e = extractelement <4 x i32> %x, i32 0 %e = extractelement <4 x i32> %x, i32 0
%r = uitofp i32 %e to double %r = uitofp i32 %e to double
ret double %r ret double %r
@ -5692,9 +5729,7 @@ define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
; SSE2-LABEL: extract3_sitofp_v4i32_f32: ; SSE2-LABEL: extract3_sitofp_v4i32_f32:
; SSE2: # %bb.0: ; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: cvtsi2ssl %eax, %xmm0
; SSE2-NEXT: retq ; SSE2-NEXT: retq
; ;
; SSE41-LABEL: extract3_sitofp_v4i32_f32: ; SSE41-LABEL: extract3_sitofp_v4i32_f32: