[X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ.

Summary: This allows the SSE intrinsic to use the EVEX instruction when available. It also fixes EVEX to not use a weird (v4i32 (fp_to_sint v2f64)) node and it merges some isel patterns. This also fixes some cases that weren't combining vzmovl with cvttpd2dq to remove extra moves.

Reviewers: delena, zvi, RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D26330

llvm-svn: 286344
This commit is contained in:
Craig Topper 2016-11-09 07:31:32 +00:00
parent ef1807fb73
commit 731bf9c5d6
5 changed files with 53 additions and 47 deletions

View File

@ -6077,6 +6077,10 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
(VCVTPD2PSZ128rr VR128X:$src)>;
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
(VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@ -6148,8 +6152,8 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
SDNode OpNode, SDNode OpNodeRnd> {
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128, SDNode OpNodeRnd> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
@ -6157,11 +6161,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
// memory forms of these instructions in Asm Parcer. They have the same
// memory forms of these instructions in Asm Parser. They have the same
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
"{1to2}", "{x}">, EVEX_V128;
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
OpNode128, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
"{1to4}", "{y}">, EVEX_V256;
}
@ -6302,7 +6306,7 @@ defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
X86cvttp2siRnd>,
XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttpd2dq,
X86cvttp2siRnd>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
@ -6310,7 +6314,7 @@ defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
X86cvttp2uiRnd>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, fp_to_uint,
X86cvttp2uiRnd>, PS, VEX_W,
EVEX_CD8<64, CD8VF>;
@ -6408,13 +6412,10 @@ def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
}
let Predicates = [HasAVX512, HasVLX] in {
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))),
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))),
(VCVTTPD2DQZ128rr VR128:$src)>;
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))),
(VCVTTPD2DQZ128rr VR128X:$src)>;
def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
(VCVTTPD2DQZ128rm addr:$src)>;
}
let Predicates = [HasAVX512] in {

View File

@ -2065,11 +2065,12 @@ let Predicates = [UseSSE2] in {
(CVTTPS2DQrm addr:$src)>;
}
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvttpd2dq VR128:$src))],
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))))],
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
@ -2078,10 +2079,11 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// XMM only
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
let Predicates = [HasAVX, NoVLX] in
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
(loadv2f64 addr:$src)))],
[(set VR128:$dst,
(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))))],
IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
// YMM only
@ -2099,13 +2101,10 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
(VCVTTPD2DQrr VR128:$src)>;
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
(VCVTTPD2DQrr VR128:$src)>;
def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
(VCVTTPD2DQXrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
(VCVTTPD2DQYrr VR256:$src)>;
@ -2125,8 +2124,9 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
Sched<[WriteCvtF2ILd]>;
let Predicates = [UseSSE2] in {
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
(CVTTPD2DQrr VR128:$src)>;
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
(CVTTPD2DQrr VR128:$src)>;
@ -2254,8 +2254,9 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX, NoVLX] in {
// Match fpround and fpextend for 128/256-bit conversions
def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(VCVTPD2PSrr VR128:$src)>;
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
(VCVTPD2PSrr VR128:$src)>;
@ -2272,8 +2273,9 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [UseSSE2] in {
// Match fpround and fpextend for 128 conversions
def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
let AddedComplexity = 15 in
def : Pat<(X86vzmovl (v2f64 (bitconvert
(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
(CVTPD2PSrr VR128:$src)>;
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
(CVTPD2PSrr VR128:$src)>;

View File

@ -574,7 +574,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::VFPEXTS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86ISD::CVTTPD2DQ, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
@ -1636,6 +1636,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTPD2DQ, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),

View File

@ -338,10 +338,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvttpd2dq:
; CHECK: ## BB#0:
; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; CHECK-NEXT: retl ## encoding: [0xc3]
; AVX-LABEL: test_x86_sse2_cvttpd2dq:
; AVX: ## BB#0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; AVX-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}

View File

@ -324,8 +324,6 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind {
; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext:
; SKX: ## BB#0:
; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0]
; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
; SKX-NEXT: ## xmm0 = xmm0[0],zero
; SKX-NEXT: retl ## encoding: [0xc3]
%cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
%res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@ -502,10 +500,15 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvttpd2dq:
; VCHECK: ## BB#0:
; VCHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
; AVX2-LABEL: test_x86_sse2_cvttpd2dq:
; AVX2: ## BB#0:
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvttpd2dq:
; SKX: ## BB#0:
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@ -516,22 +519,16 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind {
; SSE-LABEL: test_mm_cvttpd_epi32_zext:
; SSE: ## BB#0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
; SSE-NEXT: ## xmm0 = xmm0[0],zero
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_mm_cvttpd_epi32_zext:
; AVX2: ## BB#0:
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; AVX2-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX2-NEXT: ## xmm0 = xmm0[0],zero
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_mm_cvttpd_epi32_zext:
; SKX: ## BB#0:
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
; SKX-NEXT: ## xmm0 = xmm0[0],zero
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
%res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>