[X86] Lower AVX512 and SSE intrinsics for CVTTPD2DQ to X86ISD::CVTTPD2DQ.
Summary: This allows the SSE intrinsic to use the EVEX instruction when available. It also fixes EVEX to not use a weird (v4i32 (fp_to_sint v2f64)) node and it merges some isel patterns. This also fixes some cases that weren't combining vzmovl with cvttpd2dq to remove extra moves.
Reviewers: delena, zvi, RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D26330
llvm-svn: 286344
This commit is contained in:
parent
ef1807fb73
commit
731bf9c5d6
|
|
@ -6077,6 +6077,10 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
|
|||
(VCVTPS2PDZrm addr:$src)>;
|
||||
|
||||
let Predicates = [HasVLX] in {
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2f64 (bitconvert
|
||||
(v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
|
||||
(VCVTPD2PSZ128rr VR128X:$src)>;
|
||||
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
|
||||
(VCVTPS2PDZ128rm addr:$src)>;
|
||||
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
|
||||
|
|
@ -6148,8 +6152,8 @@ multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
|
|||
}
|
||||
|
||||
// Convert Double to Signed/Unsigned Doubleword with truncation
|
||||
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
|
||||
SDNode OpNode, SDNode OpNodeRnd> {
|
||||
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
SDNode OpNode128, SDNode OpNodeRnd> {
|
||||
let Predicates = [HasAVX512] in {
|
||||
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
|
||||
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
|
||||
|
|
@ -6157,11 +6161,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr,
|
|||
}
|
||||
let Predicates = [HasVLX] in {
|
||||
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
|
||||
// memory forms of these instructions in Asm Parcer. They have the same
|
||||
// memory forms of these instructions in Asm Parser. They have the same
|
||||
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
|
||||
// due to the same reason.
|
||||
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
|
||||
"{1to2}", "{x}">, EVEX_V128;
|
||||
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
|
||||
OpNode128, "{1to2}", "{x}">, EVEX_V128;
|
||||
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
|
||||
"{1to4}", "{y}">, EVEX_V256;
|
||||
}
|
||||
|
|
@ -6302,7 +6306,7 @@ defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
|
|||
X86cvttp2siRnd>,
|
||||
XS, EVEX_CD8<32, CD8VF>;
|
||||
|
||||
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint,
|
||||
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttpd2dq,
|
||||
X86cvttp2siRnd>,
|
||||
PD, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
|
||||
|
|
@ -6310,7 +6314,7 @@ defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
|
|||
X86cvttp2uiRnd>, PS,
|
||||
EVEX_CD8<32, CD8VF>;
|
||||
|
||||
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
|
||||
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, fp_to_uint,
|
||||
X86cvttp2uiRnd>, PS, VEX_W,
|
||||
EVEX_CD8<64, CD8VF>;
|
||||
|
||||
|
|
@ -6408,13 +6412,10 @@ def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
|
|||
}
|
||||
|
||||
let Predicates = [HasAVX512, HasVLX] in {
|
||||
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))))),
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src)))))),
|
||||
(VCVTTPD2DQZ128rr VR128:$src)>;
|
||||
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128X:$src))),
|
||||
(VCVTTPD2DQZ128rr VR128X:$src)>;
|
||||
def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
|
||||
(VCVTTPD2DQZ128rm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX512] in {
|
||||
|
|
|
|||
|
|
@ -2065,11 +2065,12 @@ let Predicates = [UseSSE2] in {
|
|||
(CVTTPS2DQrm addr:$src)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX, NoVLX] in
|
||||
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
||||
"cvttpd2dq\t{$src, $dst|$dst, $src}",
|
||||
[(set VR128:$dst,
|
||||
(int_x86_sse2_cvttpd2dq VR128:$src))],
|
||||
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))))],
|
||||
IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
|
||||
|
||||
// The assembler can recognize rr 256-bit instructions by seeing a ymm
|
||||
// register, but the same isn't true when using memory operands instead.
|
||||
|
|
@ -2078,10 +2079,11 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
|
|||
// XMM only
|
||||
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
|
||||
(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
|
||||
let Predicates = [HasAVX, NoVLX] in
|
||||
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
|
||||
"cvttpd2dqx\t{$src, $dst|$dst, $src}",
|
||||
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
|
||||
(loadv2f64 addr:$src)))],
|
||||
[(set VR128:$dst,
|
||||
(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))))],
|
||||
IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
|
||||
|
||||
// YMM only
|
||||
|
|
@ -2099,13 +2101,10 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
|
|||
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
|
||||
|
||||
let Predicates = [HasAVX, NoVLX] in {
|
||||
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
|
||||
(VCVTTPD2DQrr VR128:$src)>;
|
||||
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
|
||||
(VCVTTPD2DQrr VR128:$src)>;
|
||||
def : Pat<(v4i32 (X86cvttpd2dq (loadv2f64 addr:$src))),
|
||||
(VCVTTPD2DQXrm addr:$src)>;
|
||||
|
||||
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
|
||||
(VCVTTPD2DQYrr VR256:$src)>;
|
||||
|
|
@ -2125,8 +2124,9 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
|
|||
Sched<[WriteCvtF2ILd]>;
|
||||
|
||||
let Predicates = [UseSSE2] in {
|
||||
def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))))),
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2i64 (bitconvert
|
||||
(v4i32 (X86cvttpd2dq (v2f64 VR128:$src)))))),
|
||||
(CVTTPD2DQrr VR128:$src)>;
|
||||
def : Pat<(v4i32 (X86cvttpd2dq (v2f64 VR128:$src))),
|
||||
(CVTTPD2DQrr VR128:$src)>;
|
||||
|
|
@ -2254,8 +2254,9 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
|
|||
|
||||
let Predicates = [HasAVX, NoVLX] in {
|
||||
// Match fpround and fpextend for 128/256-bit conversions
|
||||
def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
|
||||
(v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2f64 (bitconvert
|
||||
(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
|
||||
(VCVTPD2PSrr VR128:$src)>;
|
||||
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
|
||||
(VCVTPD2PSrr VR128:$src)>;
|
||||
|
|
@ -2272,8 +2273,9 @@ let Predicates = [HasAVX, NoVLX] in {
|
|||
|
||||
let Predicates = [UseSSE2] in {
|
||||
// Match fpround and fpextend for 128 conversions
|
||||
def : Pat<(v4f32 (bitconvert (X86vzmovl (v2f64 (bitconvert
|
||||
(v4f32 (X86vfpround (v2f64 VR128:$src)))))))),
|
||||
let AddedComplexity = 15 in
|
||||
def : Pat<(X86vzmovl (v2f64 (bitconvert
|
||||
(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
|
||||
(CVTPD2PSrr VR128:$src)>;
|
||||
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
|
||||
(CVTPD2PSrr VR128:$src)>;
|
||||
|
|
|
|||
|
|
@ -574,7 +574,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
|
||||
X86ISD::VFPEXTS_RND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
|
||||
ISD::FP_TO_SINT, 0),
|
||||
X86ISD::CVTTPD2DQ, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
|
||||
ISD::FP_TO_SINT, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
|
||||
|
|
@ -1636,6 +1636,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
|
|||
X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
|
||||
X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
|
||||
X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
|
||||
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTPD2DQ, 0),
|
||||
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
|
||||
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
|
||||
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
|
||||
|
|
|
|||
|
|
@ -338,10 +338,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
|
|||
|
||||
|
||||
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
|
||||
; CHECK-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; CHECK-NEXT: retl ## encoding: [0xc3]
|
||||
; AVX-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; AVX: ## BB#0:
|
||||
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; AVX-NEXT: retl ## encoding: [0xc3]
|
||||
;
|
||||
; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; AVX512VL: ## BB#0:
|
||||
; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
|
||||
; AVX512VL-NEXT: retl ## encoding: [0xc3]
|
||||
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
|
|
|
|||
|
|
@ -324,8 +324,6 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind {
|
|||
; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0]
|
||||
; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
|
||||
; SKX-NEXT: ## xmm0 = xmm0[0],zero
|
||||
; SKX-NEXT: retl ## encoding: [0xc3]
|
||||
%cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
|
||||
%res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
|
|
@ -502,10 +500,15 @@ define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
|
|||
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
|
||||
; SSE-NEXT: retl ## encoding: [0xc3]
|
||||
;
|
||||
; VCHECK-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; VCHECK: ## BB#0:
|
||||
; VCHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; VCHECK-NEXT: retl ## encoding: [0xc3]
|
||||
; AVX2-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; AVX2: ## BB#0:
|
||||
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; AVX2-NEXT: retl ## encoding: [0xc3]
|
||||
;
|
||||
; SKX-LABEL: test_x86_sse2_cvttpd2dq:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
|
||||
; SKX-NEXT: retl ## encoding: [0xc3]
|
||||
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %res
|
||||
}
|
||||
|
|
@ -516,22 +519,16 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind {
|
|||
; SSE-LABEL: test_mm_cvttpd_epi32_zext:
|
||||
; SSE: ## BB#0:
|
||||
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
|
||||
; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
|
||||
; SSE-NEXT: ## xmm0 = xmm0[0],zero
|
||||
; SSE-NEXT: retl ## encoding: [0xc3]
|
||||
;
|
||||
; AVX2-LABEL: test_mm_cvttpd_epi32_zext:
|
||||
; AVX2: ## BB#0:
|
||||
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; AVX2-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
|
||||
; AVX2-NEXT: ## xmm0 = xmm0[0],zero
|
||||
; AVX2-NEXT: retl ## encoding: [0xc3]
|
||||
;
|
||||
; SKX-LABEL: test_mm_cvttpd_epi32_zext:
|
||||
; SKX: ## BB#0:
|
||||
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
|
||||
; SKX-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7e,0xc0]
|
||||
; SKX-NEXT: ## xmm0 = xmm0[0],zero
|
||||
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
|
||||
; SKX-NEXT: retl ## encoding: [0xc3]
|
||||
%cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
|
||||
%res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
|
|
|
|||
Loading…
Reference in New Issue