Tanner Gooding 2025-07-30 15:56:36 +02:00 committed by GitHub
commit 3d0bf88e75
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 1151 additions and 2682 deletions

View File

@ -40,8 +40,8 @@ For AOT compilation, the situation is far more complex. This is due to the follo
## Crossgen2 model of hardware intrinsic usage
There are two sets of instruction sets known to the compiler.
- The baseline instruction set which defaults to (Sse, Sse2), but may be adjusted via compiler option.
- The optimistic instruction set which defaults to (Sse3, Ssse3, Sse41, Sse42, Popcnt, Pclmulqdq, and Lzcnt).
- The baseline instruction set which defaults to x86-64-v2 (SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and POPCNT), but may be adjusted via compiler option.
- The optimistic instruction set which defaults to (AES, GFNI, SHA, WAITPKG, and X86SERIALIZE).
Code will be compiled using the optimistic instruction set to drive compilation, but any use of an instruction set beyond the baseline instruction set will be recorded, as will any attempt to use an instruction set beyond the optimistic set if that attempted use has a semantic effect. If the baseline instruction set includes `Avx2` then the size and characteristics of `Vector<T>` are known. Any other decisions about ABI may also be encoded. For instance, it is likely that the ABI of `Vector256<T>` and `Vector512<T>` will vary based on the presence/absence of `Avx` support.
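To make the recording policy concrete, here is a minimal C++ sketch (names hypothetical, not the Crossgen2 implementation): baseline ISAs are free to use, optimistic ISAs may be used but their use is recorded so the runtime can validate it at load time, and anything else forces a fallback path.

```cpp
#include <cstdint>

enum class Isa : uint32_t { X86Base = 1u << 0, Avx = 1u << 1, Avx2 = 1u << 2 };

struct IsaPolicy
{
    uint32_t baseline;   // always assumed present at runtime
    uint32_t optimistic; // may be used, but any use must be recorded
    uint32_t recorded = 0;

    bool TryUse(Isa isa)
    {
        uint32_t bit = static_cast<uint32_t>(isa);
        if ((baseline & bit) != 0)
            return true;     // free to use; no load-time check needed
        if ((optimistic & bit) != 0)
        {
            recorded |= bit; // becomes a load-time fixup/validation
            return true;
        }
        return false;        // compile the fallback path instead
    }
};
```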

View File

@ -359,74 +359,31 @@ jobs:
- jitstress_random_2
${{ if in(parameters.testGroup, 'jitstress-isas-arm') }}:
scenarios:
- jitstress_isas_incompletehwintrinsic
- jitstress_isas_nohwintrinsic
- jitstress_isas_nohwintrinsic_nosimd
- jitstress_isas_nosimd
${{ if in(parameters.testGroup, 'jitstress-isas-x86') }}:
scenarios:
- jitstress_isas_incompletehwintrinsic
- jitstress_isas_nohwintrinsic
- jitstress_isas_nohwintrinsic_nosimd
- jitstress_isas_nosimd
- jitstress_isas_x86_evex
- jitstress_isas_x86_noaes
- jitstress_isas_x86_noavx
- jitstress_isas_x86_noavx2
- jitstress_isas_x86_noavx512
- jitstress_isas_x86_nobmi1
- jitstress_isas_x86_nobmi2
- jitstress_isas_x86_nofma
- jitstress_isas_x86_nohwintrinsic
- jitstress_isas_x86_nolzcnt
- jitstress_isas_x86_nopclmulqdq
- jitstress_isas_x86_nopopcnt
- jitstress_isas_x86_nosse
- jitstress_isas_x86_nosse2
- jitstress_isas_x86_nosse3
- jitstress_isas_x86_nosse3_4
- jitstress_isas_x86_nosse41
- jitstress_isas_x86_nosse42
- jitstress_isas_x86_nossse3
- jitstress_isas_x86_vectort128
- jitstress_isas_x86_vectort512
- jitstress_isas_x86_noavx512_vectort128
- jitstress_isas_1_x86_noaes
- jitstress_isas_1_x86_evex
- jitstress_isas_1_x86_noavx
- jitstress_isas_1_x86_noavx2
- jitstress_isas_1_x86_noavx512
- jitstress_isas_1_x86_nobmi1
- jitstress_isas_1_x86_nobmi2
- jitstress_isas_1_x86_nofma
- jitstress_isas_1_x86_nohwintrinsic
- jitstress_isas_1_x86_nolzcnt
- jitstress_isas_1_x86_nopclmulqdq
- jitstress_isas_1_x86_nopopcnt
- jitstress_isas_1_x86_nosse
- jitstress_isas_1_x86_nosse2
- jitstress_isas_1_x86_nosse3
- jitstress_isas_1_x86_nosse3_4
- jitstress_isas_1_x86_nosse41
- jitstress_isas_1_x86_nosse42
- jitstress_isas_1_x86_nossse3
- jitstress_isas_2_x86_noaes
- jitstress_isas_1_x86_vectort128
- jitstress_isas_1_x86_vectort512
- jitstress_isas_1_x86_noavx512_vectort128
- jitstress_isas_2_x86_evex
- jitstress_isas_2_x86_noavx
- jitstress_isas_2_x86_noavx2
- jitstress_isas_2_x86_noavx512
- jitstress_isas_2_x86_nobmi1
- jitstress_isas_2_x86_nobmi2
- jitstress_isas_2_x86_nofma
- jitstress_isas_2_x86_nohwintrinsic
- jitstress_isas_2_x86_nolzcnt
- jitstress_isas_2_x86_nopclmulqdq
- jitstress_isas_2_x86_nopopcnt
- jitstress_isas_2_x86_nosse
- jitstress_isas_2_x86_nosse2
- jitstress_isas_2_x86_nosse3
- jitstress_isas_2_x86_nosse3_4
- jitstress_isas_2_x86_nosse41
- jitstress_isas_2_x86_nosse42
- jitstress_isas_2_x86_nossse3
- jitstress_isas_2_x86_vectort128
- jitstress_isas_2_x86_vectort512
- jitstress_isas_2_x86_noavx512_vectort128
${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}:
scenarios:
- jitstress_isas_x86_evex

View File

@ -669,7 +669,6 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableHWIntrinsic, W("EnableHWIntri
#endif // defined(TARGET_LOONGARCH64)
#if defined(TARGET_AMD64) || defined(TARGET_X86)
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE42, W("EnableSSE42"), 1, "Allows SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, and dependent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX, W("EnableAVX"), 1, "Allows AVX and dependent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX2, W("EnableAVX2"), 1, "Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512, W("EnableAVX512"), 1, "Allows AVX512 F+BW+CD+DQ+VL and dependent hardware intrinsics to be disabled")
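As a usage note: these knobs surface as environment variables with a `DOTNET_` prefix (legacy `COMPlus_`), e.g. `DOTNET_EnableAVX2=0` turns off AVX2 and everything listed as dependent on it. A minimal reader sketch, assuming simplified decimal parsing rather than the runtime's actual config machinery:

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper: nonzero (or unset, since these knobs default to 1)
// means the ISA family stays enabled.
bool IsIsaEnabled(const char* knob) // e.g. "EnableAVX2"
{
    std::string var = std::string("DOTNET_") + knob;
    const char* value = std::getenv(var.c_str());
    return value == nullptr || std::atoi(value) != 0;
}
```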

View File

@ -51,101 +51,97 @@ enum CORINFO_InstructionSet
#endif // TARGET_RISCV64
#ifdef TARGET_AMD64
InstructionSet_X86Base=1,
InstructionSet_SSE42=2,
InstructionSet_AVX=3,
InstructionSet_AVX2=4,
InstructionSet_AVX512=5,
InstructionSet_AVX512v2=6,
InstructionSet_AVX512v3=7,
InstructionSet_AVX10v1=8,
InstructionSet_AVX10v2=9,
InstructionSet_APX=10,
InstructionSet_AES=11,
InstructionSet_AES_V256=12,
InstructionSet_AES_V512=13,
InstructionSet_AVX512VP2INTERSECT=14,
InstructionSet_AVXIFMA=15,
InstructionSet_AVXVNNI=16,
InstructionSet_GFNI=17,
InstructionSet_GFNI_V256=18,
InstructionSet_GFNI_V512=19,
InstructionSet_SHA=20,
InstructionSet_WAITPKG=21,
InstructionSet_X86Serialize=22,
InstructionSet_Vector128=23,
InstructionSet_Vector256=24,
InstructionSet_Vector512=25,
InstructionSet_VectorT128=26,
InstructionSet_VectorT256=27,
InstructionSet_VectorT512=28,
InstructionSet_AVXVNNIINT=29,
InstructionSet_AVXVNNIINT_V512=30,
InstructionSet_X86Base_X64=31,
InstructionSet_SSE42_X64=32,
InstructionSet_AVX_X64=33,
InstructionSet_AVX2_X64=34,
InstructionSet_AVX512_X64=35,
InstructionSet_AVX512v2_X64=36,
InstructionSet_AVX512v3_X64=37,
InstructionSet_AVX10v1_X64=38,
InstructionSet_AVX10v2_X64=39,
InstructionSet_AES_X64=40,
InstructionSet_AVX512VP2INTERSECT_X64=41,
InstructionSet_AVXIFMA_X64=42,
InstructionSet_AVXVNNI_X64=43,
InstructionSet_GFNI_X64=44,
InstructionSet_SHA_X64=45,
InstructionSet_WAITPKG_X64=46,
InstructionSet_X86Serialize_X64=47,
InstructionSet_AVX=2,
InstructionSet_AVX2=3,
InstructionSet_AVX512=4,
InstructionSet_AVX512v2=5,
InstructionSet_AVX512v3=6,
InstructionSet_AVX10v1=7,
InstructionSet_AVX10v2=8,
InstructionSet_APX=9,
InstructionSet_AES=10,
InstructionSet_AES_V256=11,
InstructionSet_AES_V512=12,
InstructionSet_AVX512VP2INTERSECT=13,
InstructionSet_AVXIFMA=14,
InstructionSet_AVXVNNI=15,
InstructionSet_GFNI=16,
InstructionSet_GFNI_V256=17,
InstructionSet_GFNI_V512=18,
InstructionSet_SHA=19,
InstructionSet_WAITPKG=20,
InstructionSet_X86Serialize=21,
InstructionSet_Vector128=22,
InstructionSet_Vector256=23,
InstructionSet_Vector512=24,
InstructionSet_VectorT128=25,
InstructionSet_VectorT256=26,
InstructionSet_VectorT512=27,
InstructionSet_AVXVNNIINT=28,
InstructionSet_AVXVNNIINT_V512=29,
InstructionSet_X86Base_X64=30,
InstructionSet_AVX_X64=31,
InstructionSet_AVX2_X64=32,
InstructionSet_AVX512_X64=33,
InstructionSet_AVX512v2_X64=34,
InstructionSet_AVX512v3_X64=35,
InstructionSet_AVX10v1_X64=36,
InstructionSet_AVX10v2_X64=37,
InstructionSet_AES_X64=38,
InstructionSet_AVX512VP2INTERSECT_X64=39,
InstructionSet_AVXIFMA_X64=40,
InstructionSet_AVXVNNI_X64=41,
InstructionSet_GFNI_X64=42,
InstructionSet_SHA_X64=43,
InstructionSet_WAITPKG_X64=44,
InstructionSet_X86Serialize_X64=45,
#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
InstructionSet_SSE42=2,
InstructionSet_AVX=3,
InstructionSet_AVX2=4,
InstructionSet_AVX512=5,
InstructionSet_AVX512v2=6,
InstructionSet_AVX512v3=7,
InstructionSet_AVX10v1=8,
InstructionSet_AVX10v2=9,
InstructionSet_APX=10,
InstructionSet_AES=11,
InstructionSet_AES_V256=12,
InstructionSet_AES_V512=13,
InstructionSet_AVX512VP2INTERSECT=14,
InstructionSet_AVXIFMA=15,
InstructionSet_AVXVNNI=16,
InstructionSet_GFNI=17,
InstructionSet_GFNI_V256=18,
InstructionSet_GFNI_V512=19,
InstructionSet_SHA=20,
InstructionSet_WAITPKG=21,
InstructionSet_X86Serialize=22,
InstructionSet_Vector128=23,
InstructionSet_Vector256=24,
InstructionSet_Vector512=25,
InstructionSet_VectorT128=26,
InstructionSet_VectorT256=27,
InstructionSet_VectorT512=28,
InstructionSet_AVXVNNIINT=29,
InstructionSet_AVXVNNIINT_V512=30,
InstructionSet_X86Base_X64=31,
InstructionSet_SSE42_X64=32,
InstructionSet_AVX_X64=33,
InstructionSet_AVX2_X64=34,
InstructionSet_AVX512_X64=35,
InstructionSet_AVX512v2_X64=36,
InstructionSet_AVX512v3_X64=37,
InstructionSet_AVX10v1_X64=38,
InstructionSet_AVX10v2_X64=39,
InstructionSet_AES_X64=40,
InstructionSet_AVX512VP2INTERSECT_X64=41,
InstructionSet_AVXIFMA_X64=42,
InstructionSet_AVXVNNI_X64=43,
InstructionSet_GFNI_X64=44,
InstructionSet_SHA_X64=45,
InstructionSet_WAITPKG_X64=46,
InstructionSet_X86Serialize_X64=47,
InstructionSet_AVX=2,
InstructionSet_AVX2=3,
InstructionSet_AVX512=4,
InstructionSet_AVX512v2=5,
InstructionSet_AVX512v3=6,
InstructionSet_AVX10v1=7,
InstructionSet_AVX10v2=8,
InstructionSet_APX=9,
InstructionSet_AES=10,
InstructionSet_AES_V256=11,
InstructionSet_AES_V512=12,
InstructionSet_AVX512VP2INTERSECT=13,
InstructionSet_AVXIFMA=14,
InstructionSet_AVXVNNI=15,
InstructionSet_GFNI=16,
InstructionSet_GFNI_V256=17,
InstructionSet_GFNI_V512=18,
InstructionSet_SHA=19,
InstructionSet_WAITPKG=20,
InstructionSet_X86Serialize=21,
InstructionSet_Vector128=22,
InstructionSet_Vector256=23,
InstructionSet_Vector512=24,
InstructionSet_VectorT128=25,
InstructionSet_VectorT256=26,
InstructionSet_VectorT512=27,
InstructionSet_AVXVNNIINT=28,
InstructionSet_AVXVNNIINT_V512=29,
InstructionSet_X86Base_X64=30,
InstructionSet_AVX_X64=31,
InstructionSet_AVX2_X64=32,
InstructionSet_AVX512_X64=33,
InstructionSet_AVX512v2_X64=34,
InstructionSet_AVX512v3_X64=35,
InstructionSet_AVX10v1_X64=36,
InstructionSet_AVX10v2_X64=37,
InstructionSet_AES_X64=38,
InstructionSet_AVX512VP2INTERSECT_X64=39,
InstructionSet_AVXIFMA_X64=40,
InstructionSet_AVXVNNI_X64=41,
InstructionSet_GFNI_X64=42,
InstructionSet_SHA_X64=43,
InstructionSet_WAITPKG_X64=44,
InstructionSet_X86Serialize_X64=45,
#endif // TARGET_X86
};
@ -267,8 +263,6 @@ public:
#ifdef TARGET_AMD64
if (HasInstructionSet(InstructionSet_X86Base))
AddInstructionSet(InstructionSet_X86Base_X64);
if (HasInstructionSet(InstructionSet_SSE42))
AddInstructionSet(InstructionSet_SSE42_X64);
if (HasInstructionSet(InstructionSet_AVX))
AddInstructionSet(InstructionSet_AVX_X64);
if (HasInstructionSet(InstructionSet_AVX2))
@ -395,10 +389,6 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_X86Base);
if (resultflags.HasInstructionSet(InstructionSet_X86Base_X64) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_X86Base_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_SSE42_X64))
resultflags.RemoveInstructionSet(InstructionSet_SSE42);
if (resultflags.HasInstructionSet(InstructionSet_SSE42_X64) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
resultflags.RemoveInstructionSet(InstructionSet_SSE42_X64);
if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_AVX_X64))
resultflags.RemoveInstructionSet(InstructionSet_AVX);
if (resultflags.HasInstructionSet(InstructionSet_AVX_X64) && !resultflags.HasInstructionSet(InstructionSet_AVX))
@ -459,9 +449,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_X86Serialize);
if (resultflags.HasInstructionSet(InstructionSet_X86Serialize_X64) && !resultflags.HasInstructionSet(InstructionSet_X86Serialize))
resultflags.RemoveInstructionSet(InstructionSet_X86Serialize_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE42);
if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_AVX);
if (resultflags.HasInstructionSet(InstructionSet_AVX2) && !resultflags.HasInstructionSet(InstructionSet_AVX))
resultflags.RemoveInstructionSet(InstructionSet_AVX2);
@ -491,7 +479,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVXIFMA);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_GFNI);
if (resultflags.HasInstructionSet(InstructionSet_GFNI_V256) && !resultflags.HasInstructionSet(InstructionSet_GFNI))
resultflags.RemoveInstructionSet(InstructionSet_GFNI_V256);
@ -525,9 +513,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_VectorT512);
#endif // TARGET_AMD64
#ifdef TARGET_X86
if (resultflags.HasInstructionSet(InstructionSet_SSE42) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE42);
if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
if (resultflags.HasInstructionSet(InstructionSet_AVX) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_AVX);
if (resultflags.HasInstructionSet(InstructionSet_AVX2) && !resultflags.HasInstructionSet(InstructionSet_AVX))
resultflags.RemoveInstructionSet(InstructionSet_AVX2);
@ -557,7 +543,7 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVXIFMA);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
if (resultflags.HasInstructionSet(InstructionSet_GFNI) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_GFNI);
if (resultflags.HasInstructionSet(InstructionSet_GFNI_V256) && !resultflags.HasInstructionSet(InstructionSet_GFNI))
resultflags.RemoveInstructionSet(InstructionSet_GFNI_V256);
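The repeated Has/Remove pairs above are a dependency scrub: an ISA whose prerequisite is absent gets dropped, and since removals can cascade (AVX2 depends on AVX, which now depends directly on X86Base), the pass must reach a fixed point. A condensed model of the same idea, with hypothetical names rather than the runtime's generated code:

```cpp
#include <set>
#include <string>
#include <utility>
#include <vector>

using IsaSet = std::set<std::string>;

// deps holds (isa, prerequisite) edges, e.g. {"AVX", "X86Base"} after this
// change, where the edge used to be {"AVX", "SSE42"}.
IsaSet EnsureValid(IsaSet flags, const std::vector<std::pair<std::string, std::string>>& deps)
{
    bool changed = true;
    while (changed)
    {
        changed = false;
        for (const auto& [isa, prereq] : deps)
        {
            if (flags.count(isa) != 0 && flags.count(prereq) == 0)
            {
                flags.erase(isa); // missing prerequisite invalidates the ISA
                changed = true;
            }
        }
    }
    return flags;
}
```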
@ -673,10 +659,6 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "X86Base";
case InstructionSet_X86Base_X64 :
return "X86Base_X64";
case InstructionSet_SSE42 :
return "SSE42";
case InstructionSet_SSE42_X64 :
return "SSE42_X64";
case InstructionSet_AVX :
return "AVX";
case InstructionSet_AVX_X64 :
@ -767,8 +749,6 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
#ifdef TARGET_X86
case InstructionSet_X86Base :
return "X86Base";
case InstructionSet_SSE42 :
return "SSE42";
case InstructionSet_AVX :
return "AVX";
case InstructionSet_AVX2 :
@ -869,11 +849,11 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse2: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Avx: return InstructionSet_AVX;
case READYTORUN_INSTRUCTION_Avx2: return InstructionSet_AVX2;
case READYTORUN_INSTRUCTION_Bmi1: return InstructionSet_AVX2;
@ -938,11 +918,11 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse2: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_SSE42;
case READYTORUN_INSTRUCTION_Sse42: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse3: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Ssse3: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Sse41: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_X86Base;
case READYTORUN_INSTRUCTION_Avx: return InstructionSet_AVX;
case READYTORUN_INSTRUCTION_Avx2: return InstructionSet_AVX2;
case READYTORUN_INSTRUCTION_Bmi1: return InstructionSet_AVX2;

View File

@ -37,11 +37,11 @@
#include <minipal/guid.h>
constexpr GUID JITEEVersionIdentifier = { /* 2d40ec46-2e41-4a8b-8349-3c1267b95821 */
0x2d40ec46,
0x2e41,
0x4a8b,
{0x83, 0x49, 0x3c, 0x12, 0x67, 0xb9, 0x58, 0x21}
constexpr GUID JITEEVersionIdentifier = { /* 4c03a921-f305-47db-a9bb-c7ec4a1b83d8 */
0x4c03a921,
0xf305,
0x47db,
{0xa9, 0xbb, 0xc7, 0xec, 0x4a, 0x1b, 0x83, 0xd8}
};
#endif // JIT_EE_VERSIONING_GUID_H

View File

@ -251,17 +251,16 @@ bool IntegralRange::Contains(int64_t value) const
case NI_X86Base_CompareScalarUnorderedLessThan:
case NI_X86Base_CompareScalarUnorderedGreaterThanOrEqual:
case NI_X86Base_CompareScalarUnorderedGreaterThan:
case NI_SSE42_TestC:
case NI_SSE42_TestZ:
case NI_SSE42_TestNotZAndNotC:
case NI_X86Base_TestC:
case NI_X86Base_TestZ:
case NI_X86Base_TestNotZAndNotC:
case NI_AVX_TestC:
case NI_AVX_TestZ:
case NI_AVX_TestNotZAndNotC:
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::One};
case NI_X86Base_Extract:
case NI_SSE42_Extract:
case NI_SSE42_X64_Extract:
case NI_X86Base_X64_Extract:
case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
@ -278,8 +277,8 @@ bool IntegralRange::Contains(int64_t value) const
case NI_AVX2_TrailingZeroCount:
case NI_AVX2_X64_LeadingZeroCount:
case NI_AVX2_X64_TrailingZeroCount:
case NI_SSE42_PopCount:
case NI_SSE42_X64_PopCount:
case NI_X86Base_PopCount:
case NI_X86Base_X64_PopCount:
// Note: No advantage in using a precise range for IntegralRange.
// Example: IntCns = 42 gives [0..127] with a non-precise range, [42,42] with a precise range.
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::ByteMax};
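The [Zero, ByteMax] range for the popcount intrinsics is safe because a 64-bit input has at most 64 set bits. A tiny self-check of that bound (C++20, with `std::popcount` standing in for the `popcnt` instruction):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
    constexpr uint64_t allOnes = ~UINT64_C(0);
    static_assert(std::popcount(allOnes) == 64); // the largest value popcnt can produce
    assert(std::popcount(allOnes) <= 127);       // so [Zero, ByteMax] always contains it
    return 0;
}
```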

View File

@ -47,11 +47,11 @@ public:
private:
#if defined(TARGET_XARCH)
// Generates SSE2 code for the given tree as "Operand BitWiseOp BitMask"
void genSSE2BitwiseOp(GenTree* treeNode);
// Generates intrinsic code for the given tree as "Operand BitWiseOp BitMask"
void genIntrinsicBitwiseOp(GenTree* treeNode);
// Generates SSE42 code for the given tree as a round operation
void genSSE42RoundOp(GenTreeOp* treeNode);
// Generates intrinsic code for the given tree as a round operation
void genIntrinsicRoundOp(GenTreeOp* treeNode);
instruction simdAlignedMovIns()
{
@ -941,7 +941,6 @@ protected:
void genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genFmaIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genPermuteVar2x(GenTreeHWIntrinsic* node, insOpts instOptions);

View File

@ -707,7 +707,7 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
if (varTypeIsFloating(targetType))
{
assert(tree->OperIs(GT_NEG));
genSSE2BitwiseOp(tree);
genIntrinsicBitwiseOp(tree);
}
else
{
@ -1447,18 +1447,7 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, const ReturnTypeDesc* retTypeDesc
inst_Mov(TYP_INT, reg0, opReg, /* canSkip */ false);
// reg1 = opReg[63:32]
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1, INS_OPTS_NONE);
}
else
{
bool isRMW = !compiler->canUseVexEncoding();
int8_t shuffleMask = 1; // we only need [63:32]->[31:0], the rest is not read.
inst_RV_RV_TT_IV(INS_pshufd, EA_8BYTE, opReg, opReg, src, shuffleMask, isRMW, INS_OPTS_NONE);
inst_Mov(TYP_INT, reg1, opReg, /* canSkip */ false);
}
inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1, INS_OPTS_NONE);
#endif // TARGET_X86
}
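With SSE4.1 guaranteed by the new baseline, the `pextrd` path no longer needs the old `pshufd` fallback. An intrinsic-level sketch of the same split (illustrative only; assumes SSE4.1, which the baseline now implies):

```cpp
#include <cstdint>
#include <smmintrin.h> // SSE4.1

// Split the low 64 bits of a vector into two 32-bit return registers.
void SplitInt64(__m128i v, int32_t& lo, int32_t& hi)
{
    lo = _mm_cvtsi128_si32(v);    // movd:   element 0, bits 31:0
    hi = _mm_extract_epi32(v, 1); // pextrd: element 1, bits 63:32
}
```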
@ -2474,17 +2463,7 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)
inst_Mov(TYP_FLOAT, targetReg, reg0, /* canSkip */ false);
const emitAttr size = emitTypeSize(TYP_SIMD8);
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1, INS_OPTS_NONE);
}
else
{
regNumber tempXmm = internalRegisters.GetSingle(lclNode);
assert(tempXmm != targetReg);
inst_Mov(TYP_FLOAT, tempXmm, reg1, /* canSkip */ false);
GetEmitter()->emitIns_SIMD_R_R_R(INS_punpckldq, size, targetReg, targetReg, tempXmm, INS_OPTS_NONE);
}
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1, INS_OPTS_NONE);
genProduceReg(lclNode);
}
#elif defined(TARGET_AMD64)
@ -5805,8 +5784,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
}
case NI_X86Base_Extract:
case NI_SSE42_Extract:
case NI_SSE42_X64_Extract:
case NI_X86Base_X64_Extract:
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
case NI_AVX512_ExtractVector128:
@ -5822,15 +5800,6 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
switch (ins)
{
case INS_pextrw:
{
// The encoding which supports containment is SSE4.1+ only
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42));
ins = INS_pextrw_sse42;
break;
}
case INS_vextractf64x2:
{
ins = INS_vextractf32x4;
@ -7757,7 +7726,7 @@ int CodeGenInterface::genCallerSPtoInitialSPdelta() const
#endif // TARGET_AMD64
//-----------------------------------------------------------------------------------------
// genSSE2BitwiseOp - generate SSE2 code for the given oper as "Operand BitWiseOp BitMask"
// genIntrinsicBitwiseOp - generate intrinsic code for the given oper as "Operand BitWiseOp BitMask"
//
// Arguments:
// treeNode - tree node
@ -7769,7 +7738,7 @@ int CodeGenInterface::genCallerSPtoInitialSPdelta() const
// i) tree oper is one of GT_NEG or GT_INTRINSIC Abs()
// ii) tree type is floating point type.
// iii) caller of this routine needs to call genProduceReg()
void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
void CodeGen::genIntrinsicBitwiseOp(GenTree* treeNode)
{
regNumber targetReg = treeNode->GetRegNum();
regNumber operandReg = genConsumeReg(treeNode->gtGetOp1());
@ -7800,7 +7769,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
}
else
{
assert(!"genSSE2BitwiseOp: unsupported oper");
assert(!"genIntrinsicBitwiseOp: unsupported oper");
}
simd16_t constValue;
@ -7816,7 +7785,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
}
//-----------------------------------------------------------------------------------------
// genSSE42RoundOp - generate SSE42 code for the given tree as a round operation
// genIntrinsicRoundOp - generate intrinsic code for the given tree as a round operation
//
// Arguments:
// treeNode - tree node
@ -7825,17 +7794,13 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
// None
//
// Assumptions:
// i) SSE4.2 is supported by the underlying hardware
// ii) treeNode oper is a GT_INTRINSIC
// iii) treeNode type is a floating point type
// iv) treeNode is not used from memory
// v) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate
// vi) caller of this routine needs to call genProduceReg()
void CodeGen::genSSE42RoundOp(GenTreeOp* treeNode)
// i) treeNode oper is a GT_INTRINSIC
// ii) treeNode type is a floating point type
// iii) treeNode is not used from memory
// iv) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate
// v) caller of this routine needs to call genProduceReg()
void CodeGen::genIntrinsicRoundOp(GenTreeOp* treeNode)
{
// i) SSE4.2 is supported by the underlying hardware
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42));
// ii) treeNode oper is a GT_INTRINSIC
assert(treeNode->OperIs(GT_INTRINSIC));
@ -7878,7 +7843,7 @@ void CodeGen::genSSE42RoundOp(GenTreeOp* treeNode)
default:
ins = INS_invalid;
assert(!"genSSE42RoundOp: unsupported intrinsic");
assert(!"genRoundOp: unsupported intrinsic");
unreached();
}
@ -7901,14 +7866,14 @@ void CodeGen::genIntrinsic(GenTreeIntrinsic* treeNode)
switch (treeNode->gtIntrinsicName)
{
case NI_System_Math_Abs:
genSSE2BitwiseOp(treeNode);
genIntrinsicBitwiseOp(treeNode);
break;
case NI_System_Math_Ceiling:
case NI_System_Math_Floor:
case NI_System_Math_Truncate:
case NI_System_Math_Round:
genSSE42RoundOp(treeNode->AsOp());
genIntrinsicRoundOp(treeNode->AsOp());
break;
case NI_System_Math_Sqrt:

View File

@ -6061,11 +6061,6 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,
instructionSetFlags.AddInstructionSet(InstructionSet_X86Base);
if (JitConfig.EnableSSE42() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_SSE42);
}
if (JitConfig.EnableAVX() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AVX);

View File

@ -1960,24 +1960,10 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIn
simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
Range().InsertAfter(loResult, simdTmpVar);
GenTree* hiResult;
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
GenTree* one = m_compiler->gtNewIconNode(1);
hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);
Range().InsertAfter(simdTmpVar, one, hiResult);
}
else
{
GenTree* thirtyTwo = m_compiler->gtNewIconNode(32);
GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo,
node->GetSimdBaseJitType(), simdSize);
hiResult = m_compiler->gtNewSimdToScalarNode(TYP_INT, shift, CORINFO_TYPE_INT, simdSize);
Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult);
}
GenTree* one = m_compiler->gtNewIconNode(1);
GenTree* hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);
Range().InsertAfter(simdTmpVar, one, hiResult);
Range().Remove(node);
return FinalizeDecomposition(use, loResult, hiResult, hiResult);

View File

@ -8292,12 +8292,8 @@ void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, reg
if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0]))
{
if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) ||
emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX))
{
dataSize = 8;
ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd;
}
dataSize = 8;
ins = (cnsSize == 16) ? INS_movddup : INS_vbroadcastsd;
}
// `vbroadcastss` fills the full SIMD register, so we can't do this last step if the
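The compression relies on `movddup`, which loads 8 bytes and duplicates them into both 64-bit lanes, so a 16-byte constant with equal halves only needs 8 bytes in the data section. An illustrative equivalent with compiler intrinsics (SSE3, now part of the baseline):

```cpp
#include <cstdint>
#include <pmmintrin.h> // SSE3

// Rebuild a 16-byte constant whose two 64-bit halves are equal from just
// the stored 8-byte half.
__m128i LoadCompressedConst(const uint64_t* storedHalf)
{
    __m128d d = _mm_loaddup_pd(reinterpret_cast<const double*>(storedHalf)); // movddup
    return _mm_castpd_si128(d);
}
```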

View File

@ -4024,7 +4024,6 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
case INS_pextrd:
case INS_pextrq:
case INS_pextrw:
case INS_pextrw_sse42:
case INS_rorx:
case INS_shlx:
case INS_sarx:
@ -7003,35 +7002,8 @@ void emitter::emitStoreSimd12ToLclOffset(unsigned varNum, unsigned offset, regNu
// Store lower 8 bytes
emitIns_S_R(INS_movsd_simd, EA_8BYTE, dataReg, varNum, offset);
if (emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// Extract and store upper 4 bytes
emitIns_S_R_I(INS_extractps, EA_16BYTE, varNum, offset + 8, dataReg, 2);
}
else if (tmpRegProvider != nullptr)
{
regNumber tmpReg = codeGen->internalRegisters.GetSingle(tmpRegProvider);
assert(isFloatReg(tmpReg));
// Extract upper 4 bytes from data
emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);
// Store upper 4 bytes
emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, varNum, offset + 8);
}
else
{
// We don't have temp regs - let's do two shuffles then
// [0,1,2,3] -> [2,3,0,1]
emitIns_R_R_I(INS_pshufd, EA_16BYTE, dataReg, dataReg, 78);
// Store upper 4 bytes
emitIns_S_R(INS_movss, EA_4BYTE, dataReg, varNum, offset + 8);
// Restore dataReg to its previous state: [2,3,0,1] -> [0,1,2,3]
emitIns_R_R_I(INS_pshufd, EA_16BYTE, dataReg, dataReg, 78);
}
// Extract and store upper 4 bytes
emitIns_S_R_I(INS_extractps, EA_16BYTE, varNum, offset + 8, dataReg, 2);
}
#endif // FEATURE_SIMD
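An intrinsic-level sketch of the 12-byte store pattern above, with a hypothetical helper name: 8 bytes via a `movsd` store, then element 2 via `extractps` (SSE4.1, now baseline):

```cpp
#include <cstring>
#include <smmintrin.h> // SSE4.1

void StoreSimd12(void* dst, __m128 v)
{
    _mm_store_sd(static_cast<double*>(dst), _mm_castps_pd(v)); // movsd: bytes 0-7
    int third = _mm_extract_ps(v, 2);                          // extractps: element 2
    std::memcpy(static_cast<char*>(dst) + 8, &third, 4);       // bytes 8-11
}
```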
@ -13628,7 +13600,6 @@ void emitter::emitDispIns(
case INS_extractps:
case INS_pextrb:
case INS_pextrw:
case INS_pextrw_sse42:
case INS_pextrd:
{
tgtAttr = EA_4BYTE;

View File

@ -1182,8 +1182,8 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
case NI_AVX2_TrailingZeroCount:
case NI_AVX2_X64_LeadingZeroCount:
case NI_AVX2_X64_TrailingZeroCount:
case NI_SSE42_PopCount:
case NI_SSE42_X64_PopCount:
case NI_X86Base_PopCount:
case NI_X86Base_X64_PopCount:
case NI_Vector256_Create:
case NI_Vector512_Create:
case NI_Vector256_CreateScalar:

File diff suppressed because it is too large

View File

@ -942,7 +942,6 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
// clang-format off
#if defined(TARGET_XARCH)
{ FIRST_NI_X86Base, LAST_NI_X86Base }, // X86Base
{ FIRST_NI_SSE42, LAST_NI_SSE42 }, // SSE42
{ FIRST_NI_AVX, LAST_NI_AVX }, // AVX
{ FIRST_NI_AVX2, LAST_NI_AVX2 }, // AVX2
{ FIRST_NI_AVX512, LAST_NI_AVX512 }, // AVX512
@ -973,7 +972,6 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
{ FIRST_NI_AVXVNNIINT_V512, LAST_NI_AVXVNNIINT_V512 }, // AVXVNNIINT_V512
{ FIRST_NI_X86Base_X64, LAST_NI_X86Base_X64 }, // X86Base_X64
{ FIRST_NI_SSE42_X64, LAST_NI_SSE42_X64 }, // SSE42_X64
{ NI_Illegal, NI_Illegal }, // AVX_X64
{ FIRST_NI_AVX2_X64, LAST_NI_AVX2_X64 }, // AVX2_X64
{ FIRST_NI_AVX512_X64, LAST_NI_AVX512_X64 }, // AVX512_X64
@ -2265,9 +2263,9 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
#if defined(TARGET_XARCH)
switch (intrinsic)
{
case NI_SSE42_ConvertToVector128Int16:
case NI_SSE42_ConvertToVector128Int32:
case NI_SSE42_ConvertToVector128Int64:
case NI_X86Base_ConvertToVector128Int16:
case NI_X86Base_ConvertToVector128Int32:
case NI_X86Base_ConvertToVector128Int64:
case NI_AVX2_BroadcastScalarToVector128:
case NI_AVX2_BroadcastScalarToVector256:
case NI_AVX2_ConvertToVector256Int16:
@ -2323,7 +2321,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
: gtNewSimdHWIntrinsicNode(nodeRetType, op1, op2, intrinsic, simdBaseJitType, simdSize);
#ifdef TARGET_XARCH
if ((intrinsic == NI_SSE42_Crc32) || (intrinsic == NI_SSE42_X64_Crc32))
if ((intrinsic == NI_X86Base_Crc32) || (intrinsic == NI_X86Base_X64_Crc32))
{
// TODO-XArch-Cleanup: currently we use the simdBaseJitType to bring the type of the second argument
// to the code generator. May encode the overload info in another way.
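For reference, the `crc32` instruction these renamed intrinsics map to folds each input chunk into a running CRC-32C value. A minimal sketch using the corresponding compiler intrinsic (SSE4.2, now under the baseline):

```cpp
#include <cstddef>
#include <cstdint>
#include <nmmintrin.h> // SSE4.2

uint32_t Crc32c(uint32_t crc, const uint8_t* data, size_t len)
{
    for (size_t i = 0; i < len; i++)
    {
        crc = _mm_crc32_u8(crc, data[i]); // one byte folded in per crc32 instruction
    }
    return crc;
}
```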

View File

@ -869,7 +869,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
switch (intrinsicId)
{
case NI_SSE42_BlendVariable:
case NI_X86Base_BlendVariable:
case NI_AVX_BlendVariable:
case NI_AVX2_BlendVariable:
case NI_AVX512_BlendVariableMask:
@ -1005,13 +1005,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
case InstructionSet_SSE42:
case InstructionSet_SSE42_X64:
{
genSse42Intrinsic(node, instOptions);
break;
}
case InstructionSet_AVX:
case InstructionSet_AVX2:
case InstructionSet_AVX2_X64:
@ -1908,19 +1901,9 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
if (!canCombineLoad)
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions);
inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01,
!compiler->canUseVexEncoding(), instOptions);
}
else
{
regNumber tmpReg = internalRegisters.GetSingle(node);
genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions);
genHWIntrinsic_R_RM(node, ins, baseAttr, tmpReg, hiPart, instOptions);
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, targetReg, tmpReg, instOptions);
}
genHWIntrinsic_R_RM(node, ins, baseAttr, targetReg, loPart, instOptions);
inst_RV_RV_TT_IV(INS_pinsrd, EA_16BYTE, targetReg, targetReg, hiPart, 0x01,
!compiler->canUseVexEncoding(), instOptions);
break;
}
@ -1961,26 +1944,17 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
if (baseType == TYP_FLOAT)
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// insertps imm8 is:
// * Bits 0-3: zmask
// * Bits 4-5: count_d
// * Bits 6-7: count_s (register form only)
//
// We want zmask 0b1110 (0xE) to zero elements 1/2/3
// We want count_d 0b00 (0x0) to insert the value to element 0
// We want count_s 0b00 (0x0) as we're just taking element 0 of the source
// insertps imm8 is:
// * Bits 0-3: zmask
// * Bits 4-5: count_d
// * Bits 6-7: count_s (register form only)
//
// We want zmask 0b1110 (0xE) to zero elements 1/2/3
// We want count_d 0b00 (0x0) to insert the value to element 0
// We want count_s 0b00 (0x0) as we're just taking element 0 of the source
emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E,
instOptions);
}
else
{
assert(targetReg != op1Reg);
emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg, instOptions);
emit->emitIns_Mov(INS_movss, attr, targetReg, op1Reg, /* canSkip */ false);
}
emit->emitIns_SIMD_R_R_R_I(INS_insertps, attr, targetReg, targetReg, op1Reg, 0x0E,
instOptions);
}
else
{
@ -2145,15 +2119,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
{
if (ival == 1)
{
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg);
}
else
{
emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg,
static_cast<int8_t>(0x55), instOptions);
}
emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg);
}
else if (ival == 2)
{
@ -2564,40 +2530,11 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}
default:
unreached();
break;
}
genProduceReg(node);
}
//------------------------------------------------------------------------
// genSse42Intrinsic: Generates the code for an SSE4.2 hardware intrinsic node
//
// Arguments:
// node - The hardware intrinsic node
//
void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
{
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
regNumber targetReg = node->GetRegNum();
GenTree* op1 = node->Op(1);
var_types baseType = node->GetSimdBaseType();
var_types targetType = node->TypeGet();
emitter* emit = GetEmitter();
assert(targetReg != REG_NA);
assert(!node->OperIsCommutative());
genConsumeMultiOpOperands(node);
switch (intrinsicId)
{
case NI_SSE42_ConvertToVector128Int16:
case NI_SSE42_ConvertToVector128Int32:
case NI_SSE42_ConvertToVector128Int64:
case NI_X86Base_ConvertToVector128Int16:
case NI_X86Base_ConvertToVector128Int32:
case NI_X86Base_ConvertToVector128Int64:
{
GenTree* op1 = node->Op(1);
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType, compiler);
if (!varTypeIsSIMD(op1->TypeGet()))
@ -2614,12 +2551,13 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}
case NI_SSE42_Crc32:
case NI_SSE42_X64_Crc32:
case NI_X86Base_Crc32:
case NI_X86Base_X64_Crc32:
{
assert(instOptions == INS_OPTS_NONE);
instruction ins = INS_crc32;
GenTree* op1 = node->Op(1);
regNumber op1Reg = op1->GetRegNum();
GenTree* op2 = node->Op(2);
@ -2671,12 +2609,11 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}
case NI_SSE42_Extract:
case NI_SSE42_X64_Extract:
case NI_X86Base_Extract:
case NI_X86Base_X64_Extract:
{
assert(!varTypeIsFloating(baseType));
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType, compiler);
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
emitAttr attr = emitActualTypeSize(targetType);
@ -2703,18 +2640,16 @@ void CodeGen::genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
break;
}
case NI_SSE42_PopCount:
case NI_SSE42_X64_PopCount:
case NI_X86Base_PopCount:
case NI_X86Base_X64_PopCount:
{
genXCNTIntrinsic(node, INS_popcnt);
break;
}
default:
{
unreached();
break;
}
}
genProduceReg(node);
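The `insertps` imm8 layout spelled out in the comments above (zmask in bits 0-3, count_d in bits 4-5, count_s in bits 6-7) can be captured in a small helper; the helper below is hypothetical and only shows how the 0x0E immediate is formed:

```cpp
#include <cstdint>

constexpr uint8_t InsertPsImm8(uint8_t countS, uint8_t countD, uint8_t zmask)
{
    return static_cast<uint8_t>(((countS & 0x3) << 6) | ((countD & 0x3) << 4) | (zmask & 0xF));
}

// Take element 0 of the source, write element 0 of the target, zero elements 1/2/3.
static_assert(InsertPsImm8(0, 0, 0b1110) == 0x0E, "matches the immediate used above");
```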

View File

@ -419,20 +419,27 @@ HARDWARE_INTRINSIC(Vector512, op_UnsignedRightShift,
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsics for X86Base, SSE, SSE2
#define FIRST_NI_X86Base NI_X86Base_Add
// Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_X86Base NI_X86Base_Abs
HARDWARE_INTRINSIC(X86Base, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, AddSaturate, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, AlignRight, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandd, INS_pandd, INS_pandd, INS_pandd, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(X86Base, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
@ -473,39 +480,59 @@ HARDWARE_INTRINSIC(X86Base, ConvertToInt32,
HARDWARE_INTRINSIC(X86Base, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si32, INS_cvttsd2si32}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Double, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2pd, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2pd, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_cvtpd2dq}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_cvttpd2dq}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(X86Base, ConvertToVector128Single, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtdq2ps, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Crc32, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(X86Base, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(X86Base, Divide, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, DivideScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_divsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, Extract, 16, 2, {INS_invalid, INS_invalid, INS_pextrw, INS_pextrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Insert, 16, 3, {INS_invalid, INS_invalid, INS_pinsrw, INS_pinsrw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(X86Base, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_pextrw, INS_pextrw, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_pinsrw, INS_pinsrw, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(X86Base, LoadAlignedVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, LoadAlignedVector128NonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, LoadDquVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, LoadFence, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Barrier)
HARDWARE_INTRINSIC(X86Base, LoadHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_movhpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, LoadLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_movlpd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, LoadScalarVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd32, INS_movd32, INS_movq, INS_movq, INS_movss, INS_movsd_simd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, LoadVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(X86Base, MaskMove, 16, 3, {INS_maskmovdqu, INS_maskmovdqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
HARDWARE_INTRINSIC(X86Base, Max, 16, 2, {INS_invalid, INS_pmaxub, INS_pmaxsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
HARDWARE_INTRINSIC(X86Base, Max, 16, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
HARDWARE_INTRINSIC(X86Base, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, MemoryFence, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Barrier)
HARDWARE_INTRINSIC(X86Base, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
HARDWARE_INTRINSIC(X86Base, Min, 16, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
HARDWARE_INTRINSIC(X86Base, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment)
HARDWARE_INTRINSIC(X86Base, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment)
HARDWARE_INTRINSIC(X86Base, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
HARDWARE_INTRINSIC(X86Base, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
HARDWARE_INTRINSIC(X86Base, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(X86Base, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_mulsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, Or, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pord, INS_pord, INS_pord, INS_pord, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, PackSignedSaturate, 16, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, Pause, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other)
HARDWARE_INTRINSIC(X86Base, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(X86Base, Prefetch0, 0, 1, {INS_invalid, INS_prefetcht0, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other)
HARDWARE_INTRINSIC(X86Base, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other)
HARDWARE_INTRINSIC(X86Base, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialSideEffect_Other)
@ -514,14 +541,25 @@ HARDWARE_INTRINSIC(X86Base, Reciprocal,
HARDWARE_INTRINSIC(X86Base, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, ShiftLeftLogical, 16, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, ShiftLeftLogical128BitLane, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pslldq, INS_pslldq, INS_pslldq, INS_pslldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, ShiftRightArithmetic, 16, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_psrad, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, ShiftRightLogical, 16, 2, {INS_invalid, INS_invalid, INS_psrlw, INS_psrlw, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, ShiftRightLogical128BitLane, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrldq, INS_psrldq, INS_psrldq, INS_psrldq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, Shuffle, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, Shuffle, 16, -1, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, ShuffleHigh, 16, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, ShuffleLow, 16, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(X86Base, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_sqrtsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromSecondArg)
@ -536,6 +574,9 @@ HARDWARE_INTRINSIC(X86Base, Subtract,
HARDWARE_INTRINSIC(X86Base, SubtractSaturate, 16, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, SubtractScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_subsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(X86Base, SumAbsoluteDifferences, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, TestC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, TestNotZAndNotC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, TestZ, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(X86Base, UnpackHigh, 16, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, UnpackLow, 16, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(X86Base, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pxord, INS_pxord, INS_pxord, INS_pxord, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp|HW_Flag_NormalizeSmallTypeToInt)
@ -545,7 +586,7 @@ HARDWARE_INTRINSIC(X86Base, Xor,
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// 64-bit only Intrinsics for X86Base, SSE, SSE2
// 64-bit only Intrinsics for X86Base, SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_X86Base_X64 NI_X86Base_X64_BitScanForward
HARDWARE_INTRINSIC(X86Base_X64, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
@ -556,88 +597,14 @@ HARDWARE_INTRINSIC(X86Base_X64, ConvertScalarToVector128UInt64,
HARDWARE_INTRINSIC(X86Base_X64, ConvertToInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd64, INS_invalid, INS_cvtss2si64, INS_cvtsd2si64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(X86Base_X64, ConvertToInt64WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si64, INS_cvttsd2si64}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(X86Base_X64, ConvertToUInt64, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd64, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(X86Base_X64, Crc32, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(X86Base_X64, DivRem, 0, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_idiv, INS_div, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg|HW_Flag_MultiReg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(X86Base_X64, Extract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pextrq, INS_pextrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base_X64, Insert, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pinsrq, INS_pinsrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(X86Base_X64, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(X86Base_X64, StoreNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movnti64, INS_movnti64, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
#define LAST_NI_X86Base_X64 NI_X86Base_X64_StoreNonTemporal
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsics for SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_SSE42 NI_SSE42_Abs
HARDWARE_INTRINSIC(SSE42, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, AlignRight, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(SSE42, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, BlendVariable, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(SSE42, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(SSE42, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(SSE42, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(SSE42, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
HARDWARE_INTRINSIC(SSE42, Crc32, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(SSE42, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(SSE42, LoadAlignedVector128NonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(SSE42, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, LoadDquVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(SSE42, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE42, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE42, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE42, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE42, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE42, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(SSE42, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE42, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(SSE42, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
HARDWARE_INTRINSIC(SSE42, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(SSE42, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(SSE42, TestC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(SSE42, TestNotZAndNotC, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(SSE42, TestZ, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics|HW_Flag_NormalizeSmallTypeToInt)
#define LAST_NI_SSE42 NI_SSE42_TestZ
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// 64-bit only Intrinsics for SSE3, SSSE3, SSE41, SSE42, POPCNT
#define FIRST_NI_SSE42_X64 NI_SSE42_X64_Crc32
HARDWARE_INTRINSIC(SSE42_X64, Crc32, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_RmwIntrinsic)
HARDWARE_INTRINSIC(SSE42_X64, Extract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pextrq, INS_pextrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42_X64, Insert, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pinsrq, INS_pinsrq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_CanBenefitFromConstantProp)
HARDWARE_INTRINSIC(SSE42_X64, PopCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
#define LAST_NI_SSE42_X64 NI_SSE42_X64_PopCount
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Intrinsics for AVX
#define FIRST_NI_AVX NI_AVX_Add
HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@ -1208,8 +1175,8 @@ HARDWARE_INTRINSIC(GFNI_V512, GaloisFieldMultiply,
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// Special intrinsics that are generated during lowering
HARDWARE_INTRINSIC(X86Base, COMIS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(X86Base, UCOMIS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(SSE42, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
HARDWARE_INTRINSIC(AVX2, AndNotVector, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pandnd, INS_pandnd, INS_pandnd, INS_pandnd, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NormalizeSmallTypeToInt)
HARDWARE_INTRINSIC(AVX2, AndNotScalar, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoEvexSemantics)
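As a reading aid for the table above: each HARDWARE_INTRINSIC row carries ten instruction slots, one per base type in the fixed order {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}, with INS_invalid marking a base type that has no encoding. A minimal sketch of how such a row resolves to an instruction (hypothetical helper names; only the slot layout mirrors the real table):

#include <cstddef>

// Hypothetical sketch: one instruction slot per base type, in the same order
// as the comment banner above; INS_invalid means "no form for this base type".
enum instruction { INS_invalid = 0, INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_maxps, INS_maxpd };

struct HWIntrinsicRow
{
    const char* name;
    instruction ins[10]; // {BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, FLOAT, DOUBLE}
};

inline instruction lookupIns(const HWIntrinsicRow& row, size_t baseTypeIndex)
{
    return (baseTypeIndex < 10) ? row.ins[baseTypeIndex] : INS_invalid;
}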
@ -20,8 +20,6 @@ static CORINFO_InstructionSet X64VersionOfIsa(CORINFO_InstructionSet isa)
{
case InstructionSet_X86Base:
return InstructionSet_X86Base_X64;
case InstructionSet_SSE42:
return InstructionSet_SSE42_X64;
case InstructionSet_AVX:
return InstructionSet_AVX_X64;
case InstructionSet_AVX2:
@ -333,7 +331,7 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className)
}
else if (strcmp(className + 1, "opcnt") == 0)
{
return InstructionSet_SSE42;
return InstructionSet_X86Base;
}
}
else if (className[0] == 'S')
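Net effect of the lookup hunks above: the legacy class names now resolve to the baseline instruction set rather than a separate SSE42 set. A condensed sketch, illustrative only (the real code compares name suffixes character by character, as shown):

#include <cstring>

// Illustrative condensation of the name -> ISA mapping after this change:
// every pre-AVX SIMD class name lands on the x86 baseline.
enum IsaSketch { ISA_X86Base, ISA_Other };

inline IsaSketch lookupLegacyClass(const char* className)
{
    static const char* baseline[] = {"Popcnt", "Sse", "Sse2", "Sse3", "Ssse3", "Sse41", "Sse42"};
    for (const char* name : baseline)
    {
        if (std::strcmp(className, name) == 0)
        {
            return ISA_X86Base; // previously several of these returned a distinct SSE42 set
        }
    }
    return ISA_Other;
}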
@ -350,20 +348,20 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className)
}
else if (strcmp(className + 3, "3") == 0)
{
return InstructionSet_SSE42;
return InstructionSet_X86Base;
}
else if (strcmp(className + 3, "41") == 0)
{
return InstructionSet_SSE42;
return InstructionSet_X86Base;
}
else if (strcmp(className + 3, "42") == 0)
{
return InstructionSet_SSE42;
return InstructionSet_X86Base;
}
}
else if (strcmp(className + 1, "sse3") == 0)
{
return InstructionSet_SSE42;
return InstructionSet_X86Base;
}
}
else if (className[0] == 'V')
@ -1054,54 +1052,54 @@ int HWIntrinsicInfo::lookupIval(Compiler* comp, NamedIntrinsic id, var_types sim
return static_cast<int>(FloatComparisonMode::UnorderedNonSignaling);
}
case NI_SSE42_Ceiling:
case NI_SSE42_CeilingScalar:
case NI_X86Base_Ceiling:
case NI_X86Base_CeilingScalar:
case NI_AVX_Ceiling:
{
FALLTHROUGH;
}
case NI_SSE42_RoundToPositiveInfinity:
case NI_SSE42_RoundToPositiveInfinityScalar:
case NI_X86Base_RoundToPositiveInfinity:
case NI_X86Base_RoundToPositiveInfinityScalar:
case NI_AVX_RoundToPositiveInfinity:
{
assert(varTypeIsFloating(simdBaseType));
return static_cast<int>(FloatRoundingMode::ToPositiveInfinity);
}
case NI_SSE42_Floor:
case NI_SSE42_FloorScalar:
case NI_X86Base_Floor:
case NI_X86Base_FloorScalar:
case NI_AVX_Floor:
{
FALLTHROUGH;
}
case NI_SSE42_RoundToNegativeInfinity:
case NI_SSE42_RoundToNegativeInfinityScalar:
case NI_X86Base_RoundToNegativeInfinity:
case NI_X86Base_RoundToNegativeInfinityScalar:
case NI_AVX_RoundToNegativeInfinity:
{
assert(varTypeIsFloating(simdBaseType));
return static_cast<int>(FloatRoundingMode::ToNegativeInfinity);
}
case NI_SSE42_RoundCurrentDirection:
case NI_SSE42_RoundCurrentDirectionScalar:
case NI_X86Base_RoundCurrentDirection:
case NI_X86Base_RoundCurrentDirectionScalar:
case NI_AVX_RoundCurrentDirection:
{
assert(varTypeIsFloating(simdBaseType));
return static_cast<int>(FloatRoundingMode::CurrentDirection);
}
case NI_SSE42_RoundToNearestInteger:
case NI_SSE42_RoundToNearestIntegerScalar:
case NI_X86Base_RoundToNearestInteger:
case NI_X86Base_RoundToNearestIntegerScalar:
case NI_AVX_RoundToNearestInteger:
{
assert(varTypeIsFloating(simdBaseType));
return static_cast<int>(FloatRoundingMode::ToNearestInteger);
}
case NI_SSE42_RoundToZero:
case NI_SSE42_RoundToZeroScalar:
case NI_X86Base_RoundToZero:
case NI_X86Base_RoundToZeroScalar:
case NI_AVX_RoundToZero:
{
assert(varTypeIsFloating(simdBaseType));
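The ival computed in these cases is the imm8 control byte for roundps/roundpd/roundss/roundsd, and the same encodings are visible through the standard SSE4.1 intrinsics. A small sketch using <smmintrin.h> (standard intrinsics, not JIT code); the imm8 values noted assume the exception-suppress bit is set, which is what _MM_FROUND_NO_EXC contributes:

#include <smmintrin.h>

// Rounding-control imm8: ceiling = 0x0A, floor = 0x09,
// current direction (MXCSR.RC) = 0x0C.
__m128 round_examples(__m128 v)
{
    __m128 up   = _mm_round_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    __m128 down = _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    __m128 cur  = _mm_round_ps(v, _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC);
    return _mm_add_ps(up, _mm_add_ps(down, cur));
}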
@ -1803,11 +1801,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
break;
}
op1 = impSIMDPopStack();
retNode = gtNewSimdCeilNode(retType, op1, simdBaseJitType, simdSize);
break;
@ -1862,11 +1855,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
assert(simdBaseType == TYP_FLOAT);
if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize);
}
op1 = impSIMDPopStack();
retNode = gtNewSimdCvtNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize);
break;
}
@ -2326,8 +2316,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
op2 = impSIMDPopStack();
op1 = impSIMDPopStack();
if ((simdSize == 64) || varTypeIsByte(simdBaseType) || varTypeIsLong(simdBaseType) ||
(varTypeIsInt(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSE42)))
if ((simdSize == 64) || varTypeIsByte(simdBaseType) || varTypeIsLong(simdBaseType))
{
// The lowering for Dot doesn't handle these cases, so import as Sum(left * right)
retNode = gtNewSimdBinOpNode(GT_MUL, simdType, op1, op2, simdBaseJitType, simdSize);
@ -2467,14 +2456,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
assert(op1 != nullptr);
retNode = gtNewSimdHWIntrinsicNode(retType, op1, moveMaskIntrinsic, simdBaseJitType, simdSize);
if ((simdSize == 16) && varTypeIsShort(simdBaseType))
{
if (!compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint));
}
}
}
break;
}
@ -2491,11 +2472,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
break;
}
op1 = impSIMDPopStack();
retNode = gtNewSimdFloorNode(retType, op1, simdBaseJitType, simdSize);
break;
@ -2561,41 +2537,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);
op2 = impStackTop(0).val;
switch (simdBaseType)
{
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
{
if (!op2->IsIntegralConst(0) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// Using software fallback if simdBaseType is not supported by hardware
return nullptr;
}
break;
}
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
{
// short/ushort/float/double is supported by SSE2
break;
}
default:
{
unreached();
}
}
impPopStack();
op2 = impPopStack().val;
op1 = impSIMDPopStack();
retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
@ -2752,10 +2694,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 1);
if ((simdSize == 16) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
break;
}
if ((simdSize == 32) && !compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
break;
@ -3586,11 +3524,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
break;
}
op1 = impSIMDPopStack();
retNode = gtNewSimdRoundNode(retType, op1, simdBaseJitType, simdSize);
break;
@ -3981,11 +3914,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
if ((simdSize < 32) && !compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
break;
}
op1 = impSIMDPopStack();
retNode = gtNewSimdTruncNode(retType, op1, simdBaseJitType, simdSize);
break;
@ -4040,42 +3968,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
case NI_Vector512_WithElement:
{
assert(sig->numArgs == 3);
GenTree* indexOp = impStackTop(1).val;
switch (simdBaseType)
if (varTypeIsLong(simdBaseType))
{
// Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
if (!compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
return nullptr;
}
break;
case TYP_LONG:
case TYP_ULONG:
if (!compOpportunisticallyDependsOn(InstructionSet_SSE42_X64))
{
return nullptr;
}
break;
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
// short/ushort/float/double is supported by SSE2
break;
default:
unreached();
if (!compOpportunisticallyDependsOn(InstructionSet_X86Base_X64))
{
return nullptr;
}
}
GenTree* valueOp = impPopStack().val;
impPopStack(); // Pop the indexOp now that we know its valid
GenTree* valueOp = impPopStack().val;
GenTree* indexOp = impPopStack().val;
GenTree* vectorOp = impSIMDPopStack();
retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize);
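The long/ulong lane path above needs the X64-only instruction set because pinsrq/pextrq exist only with a REX.W encoding; the equivalent standard intrinsics make the constraint concrete. A minimal sketch (SSE4.1, 64-bit targets only):

#include <smmintrin.h>
#include <stdint.h>

// pinsrq / pextrq: insert or extract a 64-bit lane; the immediate picks the lane.
__m128i set_lane1(__m128i v, int64_t x)
{
    return _mm_insert_epi64(v, x, 1); // pinsrq xmm, r64, 1
}

int64_t get_lane0(__m128i v)
{
    return _mm_extract_epi64(v, 0); // pextrq r64, xmm, 0
}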
@ -4914,7 +4817,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
case NI_SSE42_BlendVariable:
case NI_X86Base_BlendVariable:
case NI_AVX_BlendVariable:
case NI_AVX2_BlendVariable:
case NI_AVX512_BlendVariable:
@ -5021,7 +4924,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
}
case NI_X86Base_CompareEqual:
case NI_SSE42_CompareEqual:
case NI_AVX_CompareEqual:
case NI_AVX2_CompareEqual:
case NI_AVX512_CompareEqual:
@ -5042,7 +4944,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
}
case NI_X86Base_CompareGreaterThan:
case NI_SSE42_CompareGreaterThan:
case NI_AVX_CompareGreaterThan:
case NI_AVX2_CompareGreaterThan:
case NI_AVX512_CompareGreaterThan:
@ -5082,7 +4983,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
}
case NI_X86Base_CompareLessThan:
case NI_SSE42_CompareLessThan:
case NI_AVX_CompareLessThan:
case NI_AVX2_CompareLessThan:
case NI_AVX512_CompareLessThan:
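A number of hunks above delete compOpportunisticallyDependsOn(InstructionSet_SSE42) guards. The call both answers whether codegen may use an ISA and records the dependency so precompiled (R2R/AOT) code can be validated against the target machine; with SSE4.2 folded into the baseline, the query is always true and the guards become dead. A behavioral sketch, assumed shape only (the real method lives on the JIT's Compiler class):

#include <set>

// Behavioral sketch: report whether the ISA may be used, recording the
// dependency on the "yes" path so an AOT image carries the requirement.
struct JitStateSketch
{
    std::set<int> supportedIsas;
    std::set<int> recordedDependencies;
};

inline bool opportunisticallyDependsOnSketch(JitStateSketch& jit, int isa)
{
    if (jit.supportedIsas.count(isa) == 0)
    {
        return false; // caller falls back; no requirement recorded
    }
    jit.recordedDependencies.insert(isa); // requirement recorded for precompiled code
    return true;
}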
@ -5908,27 +5908,24 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic,
#if defined(FEATURE_HW_INTRINSICS)
#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
GenTree* op2 = impPopStack().val;
GenTree* op1 = impPopStack().val;
if (varTypeIsLong(baseType))
{
GenTree* op2 = impPopStack().val;
GenTree* op1 = impPopStack().val;
if (varTypeIsLong(baseType))
{
hwintrinsic = NI_SSE42_X64_Crc32;
op1 = gtFoldExpr(gtNewCastNode(baseType, op1, /* unsigned */ true, baseType));
}
else
{
hwintrinsic = NI_SSE42_Crc32;
baseType = genActualType(baseType);
}
result = gtNewScalarHWIntrinsicNode(baseType, op1, op2, hwintrinsic);
// We use the simdBaseJitType to bring the type of the second argument to codegen
result->AsHWIntrinsic()->SetSimdBaseJitType(baseJitType);
hwintrinsic = NI_X86Base_X64_Crc32;
op1 = gtFoldExpr(gtNewCastNode(baseType, op1, /* unsigned */ true, baseType));
}
else
{
hwintrinsic = NI_X86Base_Crc32;
baseType = genActualType(baseType);
}
result = gtNewScalarHWIntrinsicNode(baseType, op1, op2, hwintrinsic);
// We use the simdBaseJitType to bring the type of the second argument to codegen
result->AsHWIntrinsic()->SetSimdBaseJitType(baseJitType);
#elif defined(TARGET_ARM64)
if (compOpportunisticallyDependsOn(InstructionSet_Crc32))
{
@ -6173,14 +6170,11 @@ GenTree* Compiler::impPrimitiveNamedIntrinsic(NamedIntrinsic intrinsic,
}
#elif defined(FEATURE_HW_INTRINSICS)
#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// Pop the value from the stack
impPopStack();
// Pop the value from the stack
impPopStack();
hwintrinsic = varTypeIsLong(baseType) ? NI_SSE42_X64_PopCount : NI_SSE42_PopCount;
result = gtNewScalarHWIntrinsicNode(baseType, op1, hwintrinsic);
}
hwintrinsic = varTypeIsLong(baseType) ? NI_X86Base_X64_PopCount : NI_X86Base_PopCount;
result = gtNewScalarHWIntrinsicNode(baseType, op1, hwintrinsic);
#elif defined(TARGET_ARM64)
// TODO-ARM64-CQ: PopCount should be handled as an intrinsic for non-constant cases
#endif // TARGET_*
@ -8153,6 +8147,8 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName)
// instructions to directly compute round/ceiling/floor/truncate.
case NI_System_Math_Abs:
case NI_System_Math_Ceiling:
case NI_System_Math_Floor:
case NI_System_Math_Max:
case NI_System_Math_MaxMagnitude:
case NI_System_Math_MaxMagnitudeNumber:
@ -8166,14 +8162,10 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName)
case NI_System_Math_MultiplyAddEstimate:
case NI_System_Math_ReciprocalEstimate:
case NI_System_Math_ReciprocalSqrtEstimate:
case NI_System_Math_Sqrt:
return true;
case NI_System_Math_Ceiling:
case NI_System_Math_Floor:
case NI_System_Math_Round:
case NI_System_Math_Sqrt:
case NI_System_Math_Truncate:
return compOpportunisticallyDependsOn(InstructionSet_SSE42);
return true;
case NI_System_Math_FusedMultiplyAdd:
return compOpportunisticallyDependsOn(InstructionSet_AVX2);
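Outside the JIT, the crc32 instruction selected by the import path above is reachable through the standard <nmmintrin.h> intrinsics; the 64-bit form takes the 32-bit accumulator zero-extended, mirroring the cast inserted for the long case. A minimal sketch of the raw crc32 accumulation over a buffer (callers typically pre- and post-invert the value for standard CRC-32C):

#include <nmmintrin.h>
#include <stdint.h>
#include <string.h>

// SSE4.2 crc32: 8 bytes at a time via the 64-bit form, byte-at-a-time tail.
uint32_t crc32c_accumulate(const uint8_t* data, size_t len, uint32_t seed)
{
    uint64_t crc = seed; // zero-extended accumulator, as in the X64 path above
    while (len >= 8)
    {
        uint64_t chunk;
        memcpy(&chunk, data, sizeof(chunk));
        crc = (uint64_t)_mm_crc32_u64(crc, chunk);
        data += 8;
        len  -= 8;
    }
    while (len-- != 0)
    {
        crc = _mm_crc32_u8((uint32_t)crc, *data++);
    }
    return (uint32_t)crc;
}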
@ -1103,7 +1103,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op)
var_types simdBaseType = hwintrinsic->GetSimdBaseType();
switch (intrinsicId)
{
case NI_SSE42_LoadAndDuplicateToVector128:
case NI_X86Base_LoadAndDuplicateToVector128:
case NI_AVX_BroadcastScalarToVector128:
case NI_AVX_BroadcastScalarToVector256:
{
@ -1127,13 +1127,13 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(instruction ins, GenTree* op)
}
}
case NI_SSE42_MoveAndDuplicate:
case NI_X86Base_MoveAndDuplicate:
case NI_AVX2_BroadcastScalarToVector128:
case NI_AVX2_BroadcastScalarToVector256:
case NI_AVX512_BroadcastScalarToVector512:
{
assert(hwintrinsic->isContained());
if (intrinsicId == NI_SSE42_MoveAndDuplicate)
if (intrinsicId == NI_X86Base_MoveAndDuplicate)
{
assert(simdBaseType == TYP_DOUBLE);
}
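For the contained-broadcast cases above: movddup replicates the low double, while movsldup/movshdup replicate the even/odd float lanes; the standard SSE3 intrinsics in <pmmintrin.h> show the semantics directly. A small sketch:

#include <pmmintrin.h>

// movddup: both double lanes get lane 0.
__m128d dup_low_double(__m128d v) { return _mm_movedup_pd(v); }

// movsldup / movshdup: each pair of float lanes takes the even / odd element.
__m128 dup_even_floats(__m128 v) { return _mm_moveldup_ps(v); }
__m128 dup_odd_floats(__m128 v)  { return _mm_movehdup_ps(v); }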
@ -217,15 +217,21 @@ INSTMUL(imul_31, "imul", IUM_RD, BAD_CODE, 0xD54400003868
#define VEX3FLT(c1,c2) PACK4(c1, 0xc5, 0x02, c2)
#define FIRST_SSE_INSTRUCTION INS_addpd
// Instructions for SSE, SSE2
// Instructions for SSE, SSE2, SSE3, SSSE3, SSE41, SSE42, POPCNT
INST3(addpd, "vaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles
INST3(addps, "vaddps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles
INST3(addsd, "vaddsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles
INST3(addss, "vaddss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles
INST3(addsubpd, "vaddsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles
INST3(addsubps, "vaddsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles
INST3(andnpd, "vandnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles
INST3(andnps, "vandnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles
INST3(andpd, "vandpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles
INST3(andps, "vandps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles
INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values
INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values
INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Doubles
INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Singles
INST3(cmppd, "vcmppd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC2), 4C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare packed doubles
INST3(cmpps, "vcmpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC2), 4C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare packed singles
INST3(cmpsd, "vcmpsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xC2), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare scalar doubles
@ -258,6 +264,15 @@ INST3(divpd, "vdivpd", IUM_WR, BAD_CODE, BAD_CODE,
INST3(divps, "vdivps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), 11C, 3C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles
INST3(divsd, "vdivsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), 13C, 4C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar doubles
INST3(divss, "vdivss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), 11C, 3C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles
INST3(dppd, "vdppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), 9C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs
INST3(dpps, "vdpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), 13C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs
INST3(extractps, "vextractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Extract Packed Floating-Point Values
INST3(haddpd, "vhaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles
INST3(haddps, "vhaddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats
INST3(hsubpd, "vhsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles
INST3(hsubps, "vhsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats
INST3(insertps, "vinsertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value
INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer
INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, ZERO, 4C, INS_TT_NONE, REX_WIG)
INST3(maskmovdqu, "vmaskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), 400C, 6C, INS_TT_NONE, REX_WIG | Encoding_VEX)
INST3(maxpd, "vmaxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), 4C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles
@ -273,6 +288,7 @@ INST3(movapd, "vmovapd", IUM_WR, PCKDBL(0x29), BAD_CODE,
INST3(movaps, "vmovaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(movd32, "vmovd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move DWORD between xmm regs <-> memory/r32 regs
INST3(movd64, "vmovq", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move QWORD between xmm regs <-> memory/r64 regs
INST3(movddup, "vmovddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_MOVDDUP, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values
INST3(movdqa32, "vmovdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName)
INST3(movdqu32, "vmovdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName)
INST3(movhlps, "vmovhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), 1C, 1C, INS_TT_NONE, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
@@ -284,23 +300,31 @@ INST3(movlps, "vmovlps", IUM_WR, PCKFLT(0x13), BAD_CODE,
INST3(movmskpd, "vmovmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros.
INST3(movmskps, "vmovmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX)
INST3(movntdq, "vmovntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint
INST3(movnti32, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_REX2)
INST3(movnti64, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_REX2)
INST3(movntpd, "vmovntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(movntps, "vmovntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(movq, "vmovq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move Quadword between memory/mm <-> regs
INST3(movsd_simd, "vmovsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
INST3(movshdup, "vmovshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values
INST3(movsldup, "vmovsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values
INST3(movss, "vmovss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction)
INST3(movupd, "vmovupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(movups, "vmovups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX)
INST3(mpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), 4C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference
INST3(mulpd, "vmulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles
INST3(mulps, "vmulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles
INST3(mulsd, "vmulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles
INST3(mulss, "vmulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single
INST3(orpd, "vorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles
INST3(orps, "vorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles
INST3(pabsb, "vpabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes
INST3(pabsd, "vpabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers
INST3(pabsw, "vpabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers
INST3(packssdw, "vpackssdw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to short with saturation
INST3(packsswb, "vpacksswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation
INST3(packusdw, "vpackusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation
INST3(packuswb, "vpackuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation
INST3(paddb, "vpaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers
INST3(paddd, "vpaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers
@@ -310,26 +334,68 @@ INST3(paddsw, "vpaddsw", IUM_WR, BAD_CODE, BAD_CODE,
INST3(paddusb, "vpaddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results
INST3(paddusw, "vpaddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results
INST3(paddw, "vpaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers
INST3(palignr, "vpalignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right
INST3(pandd, "vpand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND of two xmm regs
INST3(pandnd, "vpandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND NOT of two xmm regs
INST3(pavgb, "vpavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed byte integers
INST3(pavgw, "vpavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers
INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Bytes
INST3(pblendw, "vpblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words
INST3(pcmpeqb, "vpcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality
INST3(pcmpeqd, "vpcmpeqd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x76), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit integers for equality
INST3(pcmpeqq, "vpcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality
INST3(pcmpeqw, "vpcmpeqw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x75), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit integers for equality
INST3(pcmpgtb, "vpcmpgtb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x64), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit signed integers for greater than
INST3(pcmpgtd, "vpcmpgtd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x66), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 32-bit signed integers for greater than
INST3(pcmpgtq, "vpcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), 3C, 1C, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit signed integers for greater than
INST3(pcmpgtw, "vpcmpgtw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x65), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 16-bit signed integers for greater than
INST3(pextrw, "vpextrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC5), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract 16-bit value into a r32 with zero extended to 32-bits
INST3(pextrb, "vpextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte
INST3(pextrd, "vpextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword
INST3(pextrq, "vpextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword
INST3(phaddd, "vphaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add
INST3(pextrw, "vpextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Word
INST3(phaddsw, "vphaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation
INST3(phaddw, "vphaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers
INST3(phminposuw, "vphminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Packed Horizontal Word Minimum
INST3(phsubd, "vphsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers
INST3(phsubsw, "vphsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation
INST3(phsubw, "vphsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers
INST3(pinsrb, "vpinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte
INST3(pinsrd, "vpinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword
INST3(pinsrq, "vpinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword
INST3(pinsrw, "vpinsrw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC4), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert word at index
INST3(pmaddubsw, "vpmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes
INST3(pmaddwd, "vpmaddwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst
INST3(pmaxsb, "vpmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes
INST3(pmaxsd, "vpmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers
INST3(pmaxsw, "vpmaxsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEE), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed words
INST3(pmaxub, "vpmaxub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDE), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum unsigned bytes
INST3(pmaxud, "vpmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers
INST3(pmaxuw, "vpmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers
INST3(pminsb, "vpminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes
INST3(pminsd, "vpminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers
INST3(pminsw, "vpminsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEA), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed words
INST3(pminub, "vpminub", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDA), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum unsigned bytes
INST3(pminud, "vpminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers
INST3(pminuw, "vpminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers
INST3(pmovmskb, "vpmovmskb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD7), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) // Move the MSB bits of all bytes in a xmm reg to an int reg
INST3(pmovsxbd, "vpmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to int
INST3(pmovsxbq, "vpmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to long
INST3(pmovsxbw, "vpmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to short
INST3(pmovsxdq, "vpmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed sign extend int to long
INST3(pmovsxwd, "vpmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to int
INST3(pmovsxwq, "vpmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to long
INST3(pmovzxbd, "vpmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to int
INST3(pmovzxbq, "vpmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to long
INST3(pmovzxbw, "vpmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to short
INST3(pmovzxdq, "vpmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed zero extend int to long
INST3(pmovzxwd, "vpmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to int
INST3(pmovzxwq, "vpmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long
INST3(pmuldq, "vpmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result
INST3(pmulhrsw, "vpmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale
INST3(pmulhuw, "vpmulhuw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE4), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit unsigned integers
INST3(pmulhw, "vpmulhw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply high the packed 16-bit signed integers
INST3(pmulld, "vpmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), 10C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
INST3(pmullw, "vpmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result
INST3(pmuludq, "vpmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result
INST3(pord, "vpor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise OR of two xmm regs
@@ -338,9 +404,13 @@ INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE,
INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2)
INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2)
INST3(psadbw, "vpsadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), 3C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers
INST3(pshufb, "vpshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes
INST3(pshufd, "vpshufd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x70), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed shuffle of 32-bit integers
INST3(pshufhw, "vpshufhw", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x70), 1C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the high words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
INST3(pshuflw, "vpshuflw", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x70), 1C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Shuffle the low words in xmm2/m128 based on the encoding in imm8 and store the result in xmm1.
INST3(psignb, "vpsignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(psignd, "vpsignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(psignw, "vpsignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(pslld, "vpslld", IUM_WR, BAD_CODE, PCKDBL(0x72), PCKDBL(0xF2), ILLEGAL, ILLEGAL, INS_TT_FULL | INS_TT_MEM128, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 32-bit integers
INST3(pslldq, "vpslldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift left logical of xmm reg by given number of bytes
INST3(psllq, "vpsllq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xF3), ILLEGAL, ILLEGAL, INS_TT_FULL | INS_TT_MEM128, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift left logical of 64-bit integers
@@ -359,6 +429,7 @@ INST3(psubsw, "vpsubsw", IUM_WR, BAD_CODE, BAD_CODE,
INST3(psubusb, "vpsubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation
INST3(psubusw, "vpsubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation
INST3(psubw, "vpsubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers
INST3(ptest, "vptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare
INST3(punpckhbw, "vpunpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi)
INST3(punpckhdq, "vpunpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction)
INST3(punpckhqdq, "vpunpckhqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6D), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (hi)
@@ -370,6 +441,10 @@ INST3(punpcklwd, "vpunpcklwd", IUM_WR, BAD_CODE, BAD_CODE,
INST3(pxord, "vpxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise XOR of two xmm regs
INST3(rcpps, "vrcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Reciprocal of packed singles
INST3(rcpss, "vrcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single
INST3(roundpd, "vroundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed double precision floating-point values
INST3(roundps, "vroundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed single precision floating-point values
INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar double precision floating-point values
INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar single precision floating-point values
INST3(rsqrtps, "vrsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Reciprocal Sqrt of packed singles
INST3(rsqrtss, "vrsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x52), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal Sqrt of scalar single
INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, BAD_CODE, BAD_CODE, ZERO, 6C, INS_TT_NONE, REX_WIG)
@@ -392,84 +467,6 @@ INST3(unpcklps, "vunpcklps", IUM_WR, BAD_CODE, BAD_CODE,
INST3(xorpd, "vxorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles
INST3(xorps, "vxorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles
// Instructions for SSE3, SSSE3, SSE41, SSE42, POPCNT
INST3(addsubpd, "vaddsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles
INST3(addsubps, "vaddsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles
INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values
INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values
INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Doubles
INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Singles
INST3(dppd, "vdppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), 9C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs
INST3(dpps, "vdpps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x40), 13C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two float vector regs
INST3(extractps, "vextractps", IUM_WR, SSE3A(0x17), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | Encoding_EVEX) // Extract Packed Floating-Point Values
INST3(haddpd, "vhaddpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed doubles
INST3(haddps, "vhaddps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7C), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal add packed floats
INST3(hsubpd, "vhsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles
INST3(hsubps, "vhsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats
INST3(insertps, "vinsertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value
INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer
INST3(movddup, "vmovddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_MOVDDUP, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values
INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint
INST3(movshdup, "vmovshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values
INST3(movsldup, "vmovsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values
INST3(mpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), 4C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference
INST3(pabsb, "vpabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes
INST3(pabsd, "vpabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers
INST3(pabsw, "vpabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers
INST3(packusdw, "vpackusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation
INST3(palignr, "vpalignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right
INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Bytes
INST3(pblendw, "vpblendw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0E), 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Words
INST3(pcmpeqq, "vpcmpeqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x29), 1C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit integers for equality
INST3(pcmpgtq, "vpcmpgtq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x37), 3C, 1C, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 64-bit signed integers for greater than
INST3(pextrb, "vpextrb", IUM_WR, SSE3A(0x14), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Byte
INST3(pextrd, "vpextrd", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Dword
INST3(pextrq, "vpextrq", IUM_WR, SSE3A(0x16), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // Extract Qword
INST3(pextrw_sse42, "vpextrw", IUM_WR, SSE3A(0x15), BAD_CODE, BAD_CODE, 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // Extract Word
INST3(phaddd, "vphaddd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x02), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add
INST3(phaddsw, "vphaddsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x03), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers with saturation
INST3(phaddw, "vphaddw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x01), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal add of 16-bit integers
INST3(phminposuw, "vphminposuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x41), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Packed Horizontal Word Minimum
INST3(phsubd, "vphsubd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x06), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 32-bit integers
INST3(phsubsw, "vphsubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x07), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers with saturation
INST3(phsubw, "vphsubw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x05), 3C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed horizontal subtract of 16-bit integers
INST3(pinsrb, "vpinsrb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x20), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Byte
INST3(pinsrd, "vpinsrd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Dword
INST3(pinsrq, "vpinsrq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x22), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert Qword
INST3(pmaddubsw, "vpmaddubsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x04), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply and Add Packed Signed and Unsigned Bytes
INST3(pmaxsb, "vpmaxsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum signed bytes
INST3(pmaxsd, "vpmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit signed integers
INST3(pmaxud, "vpmaxud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 32-bit unsigned integers
INST3(pmaxuw, "vpmaxuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3E), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 16-bit unsigned integers
INST3(pminsb, "vpminsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x38), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum signed bytes
INST3(pminsd, "vpminsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit signed integers
INST3(pminud, "vpminud", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3B), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 32-bit unsigned integers
INST3(pminuw, "vpminuw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3A), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 16-bit unsigned integers
INST3(pmovsxbd, "vpmovsxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x21), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to int
INST3(pmovsxbq, "vpmovsxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x22), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to long
INST3(pmovsxbw, "vpmovsxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x20), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend byte to short
INST3(pmovsxdq, "vpmovsxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x25), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed sign extend int to long
INST3(pmovsxwd, "vpmovsxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x23), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to int
INST3(pmovsxwq, "vpmovsxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x24), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed sign extend short to long
INST3(pmovzxbd, "vpmovzxbd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x31), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_8Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to int
INST3(pmovzxbq, "vpmovzxbq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x32), ILLEGAL, ILLEGAL, INS_TT_EIGHTH_MEM, Input_8Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to long
INST3(pmovzxbw, "vpmovzxbw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x30), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_8Bit | KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend byte to short
INST3(pmovzxdq, "vpmovzxdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x35), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_32Bit | KMask_Base2 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed zero extend int to long
INST3(pmovzxwd, "vpmovzxwd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x33), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to int
INST3(pmovzxwq, "vpmovzxwq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x34), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed zero extend short to long
INST3(pmuldq, "vpmuldq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x28), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit signed integers and store 64-bit result
INST3(pmulhrsw, "vpmulhrsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0B), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply High with Round and Scale
INST3(pmulld, "vpmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), 10C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result
INST3(pshufb, "vpshufb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x00), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Shuffle Bytes
INST3(psignb, "vpsignb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x08), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(psignd, "vpsignd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x0A), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(psignw, "vpsignw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x09), 1C, 2X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed SIGN
INST3(ptest, "vptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare
INST3(roundpd, "vroundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed double precision floating-point values
INST3(roundps, "vroundps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x08), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed single precision floating-point values
INST3(roundsd, "vroundsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0B), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar double precision floating-point values
INST3(roundss, "vroundss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Round scalar single precision floating-point values
// Instructions for AESNI, PCLMULQDQ
INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow
INST3(aesdeclast, "vaesdeclast", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDF), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform last round of an AES decryption flow
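For readers unfamiliar with the table above: each INST3 row pairs an instruction id with its display mnemonic, a read/write usage mode, three opcode slots (BAD_CODE marks an encoding form that does not exist), estimated latency and throughput for perf scoring, the EVEX tuple type used for displacement compression, and a set of encoding flags. A minimal, hypothetical sketch of how an X-macro table like this can be expanded into a lookup structure; the names and reduced column set here are illustrative, not the JIT's actual definitions:

// instab_sketch.cpp - hypothetical reduced INST3 expansion
#include <cstdint>

struct InsDesc
{
    const char* name;  // display mnemonic
    uint32_t    flags; // encoding flag bits
};

// Expand each row into an InsDesc; the columns this sketch does not model
// (usage mode, opcodes, latency/throughput, tuple type) are simply dropped.
#define INST3(id, nm, um, mr, mi, rm, lat, tp, tt, fl) {nm, fl},
static const InsDesc insTable[] = {
    INST3(example, "vexample", 0, 0, 0, 0, 0, 0, 0, 0x1u)
};
#undef INST3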


@@ -394,7 +394,6 @@ RELEASE_CONFIG_INTEGER(EnableHWIntrinsic, "EnableHWIntrinsic",
#endif // defined(TARGET_LOONGARCH64)
#if defined(TARGET_AMD64) || defined(TARGET_X86)
RELEASE_CONFIG_INTEGER(EnableSSE42, "EnableSSE42", 1) // Allows SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, and dependent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX, "EnableAVX", 1) // Allows AVX and dependent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX2, "EnableAVX2", 1) // Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled
RELEASE_CONFIG_INTEGER(EnableAVX512, "EnableAVX512", 1) // Allows AVX512 F+BW+CD+DQ+VL and dependent hardware intrinsics to be disabled
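Each of these knobs conceptually gates whether the JIT may opportunistically use the corresponding ISA. A hedged sketch of that gating, using a plain getenv lookup in place of the real config plumbing (the knob name shown is an assumption for illustration):

#include <cstdlib>
#include <cstring>

// Returns true unless the knob is explicitly set to "0" (default: enabled).
static bool IsIsaEnabled(const char* knob) // e.g. "DOTNET_EnableAVX2"
{
    const char* value = std::getenv(knob);
    return (value == nullptr) || (std::strcmp(value, "0") != 0);
}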

File diff suppressed because it is too large


@@ -4164,22 +4164,6 @@ int LinearScan::BuildStoreLoc(GenTreeLclVarCommon* storeLoc)
{
BuildUse(op1, RBM_NONE, i);
}
#if defined(FEATURE_SIMD) && defined(TARGET_X86)
if (TargetOS::IsWindows && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
if (varTypeIsSIMD(storeLoc) && op1->IsCall())
{
// Need an additional register to create a SIMD8 from EAX/EDX without SSE4.1.
buildInternalFloatRegisterDefForNode(storeLoc, allSIMDRegs());
if (isCandidateVar(varDsc))
{
// This internal register must be different from the target register.
setInternalRegsDelayFree = true;
}
}
}
#endif // FEATURE_SIMD && TARGET_X86
}
else if (op1->isContained() && op1->OperIs(GT_BITCAST))
{


@@ -1710,13 +1710,6 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
{
simdTemp = buildInternalFloatRegisterDefForNode(putArgStk);
}
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// To store SIMD12 without extractps we will need
// a temp xmm reg to do the shuffle.
buildInternalFloatRegisterDefForNode(use.GetNode());
}
}
#endif // defined(FEATURE_SIMD)
@@ -2270,16 +2263,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
RefPosition* op1Use = BuildUse(op1);
srcCount += 1;
if ((baseType == TYP_FLOAT) && HWIntrinsicInfo::IsVectorCreateScalar(intrinsicId) &&
!compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
setDelayFree(op1Use);
}
else
{
tgtPrefUse = op1Use;
}
tgtPrefUse = op1Use;
}
buildUses = false;
@@ -2289,12 +2273,6 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
{
dstCandidates = allByteRegs();
}
else if (varTypeIsLong(baseType) && !compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// For SSE2 fallbacks, we will need a temp register to insert the upper half of a long
buildInternalFloatRegisterDefForNode(intrinsicTree);
setInternalRegsDelayFree = true;
}
#endif // TARGET_X86
break;
}
@@ -2396,7 +2374,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}
case NI_SSE42_BlendVariable:
case NI_X86Base_BlendVariable:
{
assert(numArgs == 3);
@@ -2424,7 +2402,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}
case NI_SSE42_Extract:
case NI_X86Base_Extract:
{
assert(!varTypeIsFloating(baseType));
@@ -2438,8 +2416,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
}
#ifdef TARGET_X86
case NI_SSE42_Crc32:
case NI_SSE42_X64_Crc32:
case NI_X86Base_Crc32:
case NI_X86Base_X64_Crc32:
{
// TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
// to the code generator. We may want to encode the overload info in another way.
@@ -3107,15 +3085,6 @@ int LinearScan::BuildIndir(GenTreeIndir* indirTree)
assert(!indirTree->TypeIs(TYP_STRUCT));
SingleTypeRegSet useCandidates = RBM_NONE;
#ifdef FEATURE_SIMD
if (indirTree->TypeIs(TYP_SIMD12) && indirTree->OperIs(GT_STOREIND) &&
!compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42) && !indirTree->Data()->IsVectorZero())
{
// GT_STOREIND needs an internal register so the upper 4 bytes can be extracted
buildInternalFloatRegisterDefForNode(indirTree);
}
#endif // FEATURE_SIMD
#ifdef TARGET_AMD64
if (varTypeUsesIntReg(indirTree->Addr()))
{
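The internal-register reservations deleted above existed to support pre-SSE4.1 fallback sequences. A hedged intrinsics sketch of the kind of shuffle those temporaries enabled, extracting the upper 4 bytes of a SIMD12 value with movhlps instead of extractps (the function name is illustrative):

#include <xmmintrin.h> // SSE

// Store element 2 (the upper 4 bytes of a 12-byte vector) without SSE4.1.
static void StoreUpper4_SSE2(float* p, __m128 v)
{
    __m128 tmp = _mm_movehl_ps(v, v); // element 2 moves down to element 0
    _mm_store_ss(p + 2, tmp);         // store the upper 4 bytes
}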


@@ -421,14 +421,9 @@ void Rationalizer::RewriteHWIntrinsicAsUserCall(GenTree** use, ArrayStack<GenTre
#if defined(TARGET_XARCH)
case NI_Vector128_ExtractMostSignificantBits:
{
// We want to keep this as is, because we'll rewrite it in post-order
assert(varTypeIsShort(simdBaseType));
if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// We want to keep this as is, because we'll rewrite it in post-order
return;
}
break;
return;
}
#endif // TARGET_XARCH
@@ -698,7 +693,7 @@ void Rationalizer::RewriteHWIntrinsicBlendv(GenTree** use, Compiler::GenTreeStac
}
else
{
intrinsic = NI_SSE42_BlendVariable;
intrinsic = NI_X86Base_BlendVariable;
}
if (HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsic) && varTypeIsSmall(simdBaseType))
@@ -917,10 +912,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS
intrinsic = NI_AVX_CompareEqual;
}
}
else if (varTypeIsLong(simdBaseType))
{
intrinsic = NI_SSE42_CompareEqual;
}
else
{
intrinsic = NI_X86Base_CompareEqual;
@@ -941,10 +932,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS
intrinsic = NI_AVX_CompareGreaterThan;
}
}
else if (varTypeIsLong(simdBaseType))
{
intrinsic = NI_SSE42_CompareGreaterThan;
}
else
{
intrinsic = NI_X86Base_CompareGreaterThan;
@@ -978,10 +965,6 @@ void Rationalizer::RewriteHWIntrinsicToNonMask(GenTree** use, Compiler::GenTreeS
intrinsic = NI_AVX_CompareLessThan;
}
}
else if (varTypeIsLong(simdBaseType))
{
intrinsic = NI_SSE42_CompareLessThan;
}
else
{
intrinsic = NI_X86Base_CompareLessThan;
@@ -1539,9 +1522,6 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
parents.Push(castNode);
}
#elif defined(TARGET_XARCH)
NamedIntrinsic moveMaskIntrinsic = NI_Illegal;
NamedIntrinsic shuffleIntrinsic = NI_Illegal;
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
// We want to tightly pack the most significant byte of each short/ushort
@@ -1554,6 +1534,8 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
simdVal.u64[0] = 0x0F0D0B0907050301;
simdVal.u64[1] = 0x8080808080808080;
NamedIntrinsic shuffleIntrinsic = NI_Illegal;
if (simdSize == 32)
{
// Vector256 works on 2x128-bit lanes, so repeat the same indices for the upper lane
@@ -1561,15 +1543,11 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
simdVal.u64[2] = 0x0F0D0B0907050301;
simdVal.u64[3] = 0x8080808080808080;
shuffleIntrinsic = NI_AVX2_Shuffle;
moveMaskIntrinsic = NI_X86Base_MoveMask;
shuffleIntrinsic = NI_AVX2_Shuffle;
}
else
{
assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE42));
shuffleIntrinsic = NI_SSE42_Shuffle;
moveMaskIntrinsic = NI_X86Base_MoveMask;
shuffleIntrinsic = NI_X86Base_Shuffle;
}
GenTree* op2 = comp->gtNewVconNode(simdType);
@@ -1606,7 +1584,7 @@ void Rationalizer::RewriteHWIntrinsicExtractMsb(GenTree** use, Compiler::GenTree
simdSize = 16;
}
node->ChangeHWIntrinsicId(moveMaskIntrinsic);
node->ChangeHWIntrinsicId(NI_X86Base_MoveMask);
node->SetSimdSize(simdSize);
node->SetSimdBaseJitType(simdBaseJitType);
node->Op(1) = op1;
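The rewrite above implements ExtractMostSignificantBits for short/ushort by packing the high byte of each 16-bit lane with pshufb, then reading the byte sign bits with pmovmskb. A hedged sketch of the same trick with plain SSSE3 intrinsics (the function name is illustrative):

#include <tmmintrin.h> // SSSE3
#include <cstdint>

// Extract the most significant bit of each of the 8 ushort lanes in v.
static uint32_t ExtractMsb16x8(__m128i v)
{
    // Indices 0x01,0x03,...,0x0F gather the high byte of each 16-bit lane
    // into bytes 0..7; index 0x80 zeroes the destination byte.
    const __m128i idx = _mm_set_epi64x((int64_t)0x8080808080808080ULL,
                                       0x0F0D0B0907050301LL);
    __m128i packed = _mm_shuffle_epi8(v, idx);
    return (uint32_t)_mm_movemask_epi8(packed) & 0xFFu;
}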


@@ -88,23 +88,12 @@ void CodeGen::genStoreIndTypeSimd12(GenTreeStoreInd* treeNode)
// Store upper 4 bytes
emit->emitInsStoreInd(INS_movss, EA_4BYTE, treeNode);
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
else
{
// Extract and store upper 4 bytes
GenTreeStoreInd storeInd = storeIndirForm(TYP_SIMD16, addr, data);
emit->emitIns_A_R_I(INS_extractps, EA_16BYTE, &storeInd, dataReg, 2);
}
else
{
regNumber tmpReg = internalRegisters.GetSingle(treeNode);
// Extract upper 4 bytes from data
emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);
data->SetRegNum(tmpReg);
// Store upper 4 bytes
emit->emitInsStoreInd(INS_movss, EA_4BYTE, treeNode);
}
}
//-----------------------------------------------------------------------------
@@ -133,15 +122,11 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode)
return;
}
emitter* emit = GetEmitter();
regNumber tgtReg = treeNode->GetRegNum();
bool useSse42 = compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42);
emitter* emit = GetEmitter();
regNumber tgtReg = treeNode->GetRegNum();
if (useSse42)
{
// Load lower 8 bytes
emit->emitInsLoadInd(INS_movsd_simd, EA_8BYTE, tgtReg, treeNode);
}
// Load lower 8 bytes
emit->emitInsLoadInd(INS_movsd_simd, EA_8BYTE, tgtReg, treeNode);
// Update the addr node to offset by 8
@@ -164,41 +149,9 @@ void CodeGen::genLoadIndTypeSimd12(GenTreeIndir* treeNode)
treeNode->Addr() = addr;
if (useSse42)
{
// Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3
GenTreeIndir indir = indirForm(TYP_SIMD16, addr);
emit->emitIns_SIMD_R_R_A_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, &indir, 0x28, INS_OPTS_NONE);
}
else
{
// Load upper 4 bytes to lower half of tgtReg
emit->emitInsLoadInd(INS_movss, EA_4BYTE, tgtReg, treeNode);
// Move upper 4 bytes to upper half of tgtReg
emit->emitIns_R_R(INS_movlhps, EA_16BYTE, tgtReg, tgtReg);
// Revert the addr node to the original offset
// Doing it this way saves us a register and produces smaller code
if (treeNode->isIndirAddrMode())
{
GenTreeAddrMode* addrMode = addr->AsAddrMode();
addrMode->SetOffset(addrMode->Offset() - 8);
}
else if (addr->IsCnsIntOrI() && addr->isContained())
{
GenTreeIntConCommon* icon = addr->AsIntConCommon();
icon->SetIconValue(icon->IconValue() - 8);
}
else
{
unreached();
}
// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitInsLoadInd(INS_movlps, EA_16BYTE, tgtReg, treeNode);
}
// Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3
GenTreeIndir indir = indirForm(TYP_SIMD16, addr);
emit->emitIns_SIMD_R_R_A_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, &indir, 0x28, INS_OPTS_NONE);
genProduceReg(treeNode);
}
@@ -288,21 +241,11 @@ void CodeGen::genEmitStoreLclTypeSimd12(GenTree* store, unsigned lclNum, unsigne
// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, dataReg, lclNum, offset + 8);
}
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
else
{
// Extract and store upper 4 bytes
emit->emitIns_S_R_I(INS_extractps, EA_16BYTE, lclNum, offset + 8, dataReg, 2);
}
else
{
regNumber tmpReg = internalRegisters.GetSingle(store);
// Extract upper 4 bytes from data
emit->emitIns_R_R(INS_movhlps, EA_16BYTE, tmpReg, dataReg);
// Store upper 4 bytes
emit->emitIns_S_R(INS_movss, EA_4BYTE, tmpReg, lclNum, offset + 8);
}
}
//------------------------------------------------------------------------
@@ -317,25 +260,11 @@ void CodeGen::genEmitLoadLclTypeSimd12(regNumber tgtReg, unsigned lclNum, unsign
{
emitter* emit = GetEmitter();
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, lclNum, offset);
// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitIns_R_S(INS_movsd_simd, EA_8BYTE, tgtReg, lclNum, offset);
// Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3
emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, lclNum, offset + 8, 0x28, INS_OPTS_NONE);
}
else
{
// Load upper 4 bytes to lower half of tgtReg
emit->emitIns_R_S(INS_movss, EA_4BYTE, tgtReg, lclNum, offset + 8);
// Move upper 4 bytes to upper half of tgtReg
emit->emitIns_R_R(INS_movlhps, EA_16BYTE, tgtReg, tgtReg);
// Load lower 8 bytes into tgtReg, preserving upper 4 bytes
emit->emitIns_R_S(INS_movlps, EA_16BYTE, tgtReg, lclNum, offset);
}
// Load and insert upper 4 bytes, 0x20 inserts to index 2 and 0x8 zeros index 3
emit->emitIns_SIMD_R_R_S_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, lclNum, offset + 8, 0x28, INS_OPTS_NONE);
}
#ifdef TARGET_X86
@ -524,26 +453,12 @@ void CodeGen::genSimd12UpperClear(regNumber tgtReg)
{
assert(genIsValidFloatReg(tgtReg));
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
// ZMASK: 0b1000 - Preserve element 0, 1, and 2; Zero element 3
// COUNT_D: 0b11 - Insert into element 3
// COUNT_S: 0b11 - Insert from element 3
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, tgtReg, static_cast<int8_t>(0xF8),
INS_OPTS_NONE);
}
else
{
// Preserve element 0, 1, and 2; Zero element 3
simd16_t constValue;
constValue.u32[0] = 0xFFFFFFFF;
constValue.u32[1] = 0xFFFFFFFF;
constValue.u32[2] = 0xFFFFFFFF;
constValue.u32[3] = 0x00000000;
CORINFO_FIELD_HANDLE zroSimd12Elm3 = GetEmitter()->emitSimd16Const(constValue);
GetEmitter()->emitIns_SIMD_R_R_C(INS_andps, EA_16BYTE, tgtReg, tgtReg, zroSimd12Elm3, 0, INS_OPTS_NONE);
}
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_insertps, EA_16BYTE, tgtReg, tgtReg, tgtReg, static_cast<int8_t>(0xF8),
INS_OPTS_NONE);
}
#endif // FEATURE_SIMD
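
The 0xF8 immediate above (COUNT_S = 3, COUNT_D = 3, ZMASK = 0b1000) can be cross-checked from managed code via the public intrinsics; a minimal sketch (illustrative, not the JIT's own implementation):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Clears the upper (index 3) float lane of a Vector128, i.e. the TYP_SIMD12 layout.
static Vector128<float> ClearUpper(Vector128<float> v) =>
    Sse41.IsSupported
        ? Sse41.Insert(v, v, 0xF8)   // insertps: COUNT_S=3, COUNT_D=3, ZMASK=0b1000
        : v.WithElement(3, 0.0f);    // portable fallback
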

View File

@ -6702,8 +6702,8 @@ bool ValueNumStore::IsVNNeverNegative(ValueNum vn)
case VNF_MDArrLowerBound:
#ifdef FEATURE_HW_INTRINSICS
#ifdef TARGET_XARCH
case VNF_HWI_SSE42_PopCount:
case VNF_HWI_SSE42_X64_PopCount:
case VNF_HWI_X86Base_PopCount:
case VNF_HWI_X86Base_X64_PopCount:
case VNF_HWI_AVX2_LeadingZeroCount:
case VNF_HWI_AVX2_TrailingZeroCount:
case VNF_HWI_AVX2_X64_LeadingZeroCount:
@ -8084,7 +8084,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
return VNForLongCon(static_cast<int64_t>(result));
}
case NI_SSE42_PopCount:
case NI_X86Base_PopCount:
{
assert(!varTypeIsSmall(type) && !varTypeIsLong(type));
@ -8094,7 +8094,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunUnary(GenTreeHWIntrinsic* tree,
return VNForIntCon(static_cast<int32_t>(result));
}
case NI_SSE42_X64_PopCount:
case NI_X86Base_X64_PopCount:
{
assert(varTypeIsLong(type));
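
A short illustration of why these intrinsics qualify for IsVNNeverNegative: a population count returns the number of set bits, so it is bounded by the input width and can never be negative. A sketch with illustrative values:

using System.Numerics;
using System.Runtime.Intrinsics.X86;

uint value = 0xF0;
// popcnt of a 32-bit input is always in [0, 32]; the X64 variant is in [0, 64].
uint bits = Popcnt.IsSupported ? Popcnt.PopCount(value) : (uint)BitOperations.PopCount(value);
// bits == 4
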

View File

@ -176,13 +176,32 @@ static bool InitDLL(HANDLE hPalInstance)
bool DetectCPUFeatures()
{
#if defined(HOST_X86) || defined(HOST_AMD64) || defined(HOST_ARM64)
g_cpuFeatures = minipal_getcpufeatures();
int cpuFeatures = minipal_getcpufeatures();
if ((g_cpuFeatures & g_requiredCpuFeatures) != g_requiredCpuFeatures)
if ((cpuFeatures & IntrinsicConstants_Invalid) != 0)
{
PalPrintFatalError("\nThe required instruction sets are not supported by the current CPU.\n");
#if defined(HOST_X86) || defined(HOST_AMD64)
PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT\n");
#elif defined(HOST_ARM64) && (defined(HOST_WINDOWS) || defined(HOST_APPLE))
PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd, LSE\n");
#elif defined(HOST_ARM64)
PalPrintFatalError("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd\n");
#else
PalPrintFatalError("\nThe current CPU is missing one or more of the baseline instruction sets.\n");
#endif
RhFailFast();
}
int missingCpuFeatures = g_requiredCpuFeatures & ~cpuFeatures;
if (missingCpuFeatures != 0)
{
PalPrintFatalError("\nThe current CPU is missing one or more of the required instruction sets.\n");
RhFailFast();
}
g_cpuFeatures = cpuFeatures;
#endif // HOST_X86 || HOST_AMD64 || HOST_ARM64
return true;
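
The required-feature check above boils down to simple mask arithmetic: any bit set in the required mask but clear in the detected mask means the CPU cannot run the image. A minimal C# sketch (names are illustrative, not the actual runtime symbols):

// Bits set in 'required' but clear in 'detected' are the missing ISAs.
static int MissingFeatures(int detected, int required) => required & ~detected;

int detected = 0b0101;
int required = 0b0111;
bool mustFailFast = MissingFeatures(detected, required) != 0; // true: bit 1 is missing
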

View File

@ -60,33 +60,28 @@ namespace ILCompiler
// Keep these enumerations in sync with cpufeatures.h in the minipal.
private static class XArchIntrinsicConstants
{
// SSE and SSE2 are baseline ISAs - they're always available
public const int Sse42 = (1 << 0);
public const int Avx = (1 << 1);
public const int Avx2 = (1 << 2);
public const int Avx512 = (1 << 3);
public const int Avx512v2 = (1 << 4);
public const int Avx512v3 = (1 << 5);
public const int Avx10v1 = (1 << 6);
public const int Avx10v2 = (1 << 7);
public const int Apx = (1 << 8);
public const int Aes = (1 << 9);
public const int Avx512Vp2intersect = (1 << 10);
public const int AvxIfma = (1 << 11);
public const int AvxVnni = (1 << 12);
// SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, and POPCNT are baseline ISAs - they're always available
public const int Avx = (1 << 0);
public const int Avx2 = (1 << 1);
public const int Avx512 = (1 << 2);
public const int Avx512v2 = (1 << 3);
public const int Avx512v3 = (1 << 4);
public const int Avx10v1 = (1 << 5);
public const int Avx10v2 = (1 << 6);
public const int Apx = (1 << 7);
public const int Aes = (1 << 8);
public const int Avx512Vp2intersect = (1 << 9);
public const int AvxIfma = (1 << 10);
public const int AvxVnni = (1 << 11);
public const int AvxVnniInt = (1 << 12);
public const int Gfni = (1 << 13);
public const int Sha = (1 << 14);
public const int Vaes = (1 << 15);
public const int WaitPkg = (1 << 16);
public const int X86Serialize = (1 << 17);
public const int AvxVnniInt = (1 << 18);
public static void AddToBuilder(InstructionSetSupportBuilder builder, int flags)
{
if ((flags & Sse42) != 0)
builder.AddSupportedInstructionSet("sse42");
if ((flags & Avx) != 0)
builder.AddSupportedInstructionSet("avx");
if ((flags & Avx2) != 0)
@ -144,15 +139,11 @@ namespace ILCompiler
public static int FromInstructionSet(InstructionSet instructionSet)
{
Debug.Assert(InstructionSet.X64_AES == InstructionSet.X86_AES);
Debug.Assert(InstructionSet.X64_SSE42 == InstructionSet.X86_SSE42);
Debug.Assert(InstructionSet.X64_AVX2 == InstructionSet.X86_AVX2);
return instructionSet switch
{
// Optional ISAs - only available via opt-in or opportunistic light-up
InstructionSet.X64_SSE42 => Sse42,
InstructionSet.X64_SSE42_X64 => Sse42,
InstructionSet.X64_AVX => Avx,
InstructionSet.X64_AVX_X64 => Avx,

View File

@ -17,25 +17,63 @@ namespace System.CommandLine
internal static partial class Helpers
{
public static InstructionSetSupport ConfigureInstructionSetSupport(string instructionSet, int maxVectorTBitWidth, bool isVectorTOptimistic, TargetArchitecture targetArchitecture, TargetOS targetOS,
string mustNotBeMessage, string invalidImplicationMessage, Logger logger, bool optimizingForSize = false)
string mustNotBeMessage, string invalidImplicationMessage, Logger logger, bool optimizingForSize, bool isReadyToRun)
{
InstructionSetSupportBuilder instructionSetSupportBuilder = new(targetArchitecture);
// Ready to run images are built with certain instruction set baselines
// Images are built with certain instruction set baselines
//
// For NativeAOT, this represents the minimum hardware required to run.
// Older hardware will not work
//
// For ReadyToRun, this represents the presumed majority hardware.
// Older hardware (down to the NativeAOT baseline) will still work, but may incur more jitting on startup
if ((targetArchitecture == TargetArchitecture.X86) || (targetArchitecture == TargetArchitecture.X64))
{
instructionSetSupportBuilder.AddSupportedInstructionSet("base");
if (isReadyToRun)
{
// ReadyToRun can presume AVX2, BMI1, BMI2, F16C, FMA, LZCNT, and MOVBE
instructionSetSupportBuilder.AddSupportedInstructionSet("x86-64-v3");
}
else
{
// Otherwise, we require SSE4.2 and POPCNT
instructionSetSupportBuilder.AddSupportedInstructionSet("x86-64-v2");
}
}
else if (targetArchitecture == TargetArchitecture.ARM64)
{
if (targetOS == TargetOS.OSX)
{
// For osx-arm64 we know that apple-m1 is a baseline
// For osx-arm64 we know that apple-m1 is the baseline
instructionSetSupportBuilder.AddSupportedInstructionSet("apple-m1");
}
else if (isReadyToRun)
{
if (targetOS == TargetOS.Windows)
{
// ReadyToRun on Windows can presume armv8.2-a and RCPC
instructionSetSupportBuilder.AddSupportedInstructionSet("armv8.2-a");
instructionSetSupportBuilder.AddSupportedInstructionSet("rcpc");
}
else
{
// Unix, however, needs a lower baseline due to devices like the Raspberry Pi
instructionSetSupportBuilder.AddSupportedInstructionSet("armv8-a");
instructionSetSupportBuilder.AddSupportedInstructionSet("lse");
}
}
else
{
instructionSetSupportBuilder.AddSupportedInstructionSet("neon");
// We require armv8-a everywhere
instructionSetSupportBuilder.AddSupportedInstructionSet("armv8-a");
if (targetOS == TargetOS.Windows)
{
// However, Windows also requires LSE
instructionSetSupportBuilder.AddSupportedInstructionSet("lse");
}
}
}
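
Condensed, the baseline selection above behaves like the following sketch (illustrative pseudocode over the same inputs and the enums used in the surrounding code, not the actual ILCompiler control flow):

using System;

static string[] Baseline(TargetArchitecture arch, TargetOS os, bool isReadyToRun) => (arch, os, isReadyToRun) switch
{
    (TargetArchitecture.X86 or TargetArchitecture.X64, _, true)  => new[] { "x86-64-v3" },              // R2R: presumed majority hardware
    (TargetArchitecture.X86 or TargetArchitecture.X64, _, false) => new[] { "x86-64-v2" },              // NativeAOT: minimum hardware
    (TargetArchitecture.ARM64, TargetOS.OSX, _)                  => new[] { "apple-m1" },
    (TargetArchitecture.ARM64, TargetOS.Windows, true)           => new[] { "armv8.2-a", "rcpc" },
    (TargetArchitecture.ARM64, _, true)                          => new[] { "armv8-a", "lse" },
    (TargetArchitecture.ARM64, TargetOS.Windows, false)          => new[] { "armv8-a", "lse" },
    (TargetArchitecture.ARM64, _, false)                         => new[] { "armv8-a" },
    _ => Array.Empty<string>(),
};
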
@ -187,7 +225,6 @@ namespace System.CommandLine
// Note that we do not indicate support for AVX, or any other instruction set which uses the VEX encodings as
// the presence of those makes otherwise acceptable code unusable on hardware which does not support VEX encodings.
//
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sse42");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("aes");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("gfni");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha");
@ -234,11 +271,13 @@ namespace System.CommandLine
{
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("aes");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("crc");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("dotprod");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lse");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rcpc");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rcpc2");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rdma");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha1");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha2");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lse");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("dotprod");
optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rdma");
}
// Vector<T> can always be part of the optimistic set; we only want to optionally exclude it from the supported set

View File

@ -73,8 +73,6 @@ namespace Internal.ReadyToRunConstants
{
case InstructionSet.X64_X86Base: return ReadyToRunInstructionSet.X86Base;
case InstructionSet.X64_X86Base_X64: return ReadyToRunInstructionSet.X86Base;
case InstructionSet.X64_SSE42: return ReadyToRunInstructionSet.Sse42;
case InstructionSet.X64_SSE42_X64: return ReadyToRunInstructionSet.Sse42;
case InstructionSet.X64_AVX: return ReadyToRunInstructionSet.Avx;
case InstructionSet.X64_AVX_X64: return ReadyToRunInstructionSet.Avx;
case InstructionSet.X64_AVX2: return ReadyToRunInstructionSet.Avx2;
@ -129,8 +127,6 @@ namespace Internal.ReadyToRunConstants
{
case InstructionSet.X86_X86Base: return ReadyToRunInstructionSet.X86Base;
case InstructionSet.X86_X86Base_X64: return null;
case InstructionSet.X86_SSE42: return ReadyToRunInstructionSet.Sse42;
case InstructionSet.X86_SSE42_X64: return null;
case InstructionSet.X86_AVX: return ReadyToRunInstructionSet.Avx;
case InstructionSet.X86_AVX_X64: return null;
case InstructionSet.X86_AVX2: return ReadyToRunInstructionSet.Avx2;

View File

@ -47,7 +47,6 @@ namespace Internal.JitInterface
RiscV64_Zba = InstructionSet_RiscV64.Zba,
RiscV64_Zbb = InstructionSet_RiscV64.Zbb,
X64_X86Base = InstructionSet_X64.X86Base,
X64_SSE42 = InstructionSet_X64.SSE42,
X64_AVX = InstructionSet_X64.AVX,
X64_AVX2 = InstructionSet_X64.AVX2,
X64_AVX512 = InstructionSet_X64.AVX512,
@ -77,7 +76,6 @@ namespace Internal.JitInterface
X64_AVXVNNIINT = InstructionSet_X64.AVXVNNIINT,
X64_AVXVNNIINT_V512 = InstructionSet_X64.AVXVNNIINT_V512,
X64_X86Base_X64 = InstructionSet_X64.X86Base_X64,
X64_SSE42_X64 = InstructionSet_X64.SSE42_X64,
X64_AVX_X64 = InstructionSet_X64.AVX_X64,
X64_AVX2_X64 = InstructionSet_X64.AVX2_X64,
X64_AVX512_X64 = InstructionSet_X64.AVX512_X64,
@ -94,7 +92,6 @@ namespace Internal.JitInterface
X64_WAITPKG_X64 = InstructionSet_X64.WAITPKG_X64,
X64_X86Serialize_X64 = InstructionSet_X64.X86Serialize_X64,
X86_X86Base = InstructionSet_X86.X86Base,
X86_SSE42 = InstructionSet_X86.SSE42,
X86_AVX = InstructionSet_X86.AVX,
X86_AVX2 = InstructionSet_X86.AVX2,
X86_AVX512 = InstructionSet_X86.AVX512,
@ -124,7 +121,6 @@ namespace Internal.JitInterface
X86_AVXVNNIINT = InstructionSet_X86.AVXVNNIINT,
X86_AVXVNNIINT_V512 = InstructionSet_X86.AVXVNNIINT_V512,
X86_X86Base_X64 = InstructionSet_X86.X86Base_X64,
X86_SSE42_X64 = InstructionSet_X86.SSE42_X64,
X86_AVX_X64 = InstructionSet_X86.AVX_X64,
X86_AVX2_X64 = InstructionSet_X86.AVX2_X64,
X86_AVX512_X64 = InstructionSet_X86.AVX512_X64,
@ -188,52 +184,50 @@ namespace Internal.JitInterface
ILLEGAL = InstructionSet.ILLEGAL,
NONE = InstructionSet.NONE,
X86Base = 1,
SSE42 = 2,
AVX = 3,
AVX2 = 4,
AVX512 = 5,
AVX512v2 = 6,
AVX512v3 = 7,
AVX10v1 = 8,
AVX10v2 = 9,
APX = 10,
AES = 11,
AES_V256 = 12,
AES_V512 = 13,
AVX512VP2INTERSECT = 14,
AVXIFMA = 15,
AVXVNNI = 16,
GFNI = 17,
GFNI_V256 = 18,
GFNI_V512 = 19,
SHA = 20,
WAITPKG = 21,
X86Serialize = 22,
Vector128 = 23,
Vector256 = 24,
Vector512 = 25,
VectorT128 = 26,
VectorT256 = 27,
VectorT512 = 28,
AVXVNNIINT = 29,
AVXVNNIINT_V512 = 30,
X86Base_X64 = 31,
SSE42_X64 = 32,
AVX_X64 = 33,
AVX2_X64 = 34,
AVX512_X64 = 35,
AVX512v2_X64 = 36,
AVX512v3_X64 = 37,
AVX10v1_X64 = 38,
AVX10v2_X64 = 39,
AES_X64 = 40,
AVX512VP2INTERSECT_X64 = 41,
AVXIFMA_X64 = 42,
AVXVNNI_X64 = 43,
GFNI_X64 = 44,
SHA_X64 = 45,
WAITPKG_X64 = 46,
X86Serialize_X64 = 47,
AVX = 2,
AVX2 = 3,
AVX512 = 4,
AVX512v2 = 5,
AVX512v3 = 6,
AVX10v1 = 7,
AVX10v2 = 8,
APX = 9,
AES = 10,
AES_V256 = 11,
AES_V512 = 12,
AVX512VP2INTERSECT = 13,
AVXIFMA = 14,
AVXVNNI = 15,
GFNI = 16,
GFNI_V256 = 17,
GFNI_V512 = 18,
SHA = 19,
WAITPKG = 20,
X86Serialize = 21,
Vector128 = 22,
Vector256 = 23,
Vector512 = 24,
VectorT128 = 25,
VectorT256 = 26,
VectorT512 = 27,
AVXVNNIINT = 28,
AVXVNNIINT_V512 = 29,
X86Base_X64 = 30,
AVX_X64 = 31,
AVX2_X64 = 32,
AVX512_X64 = 33,
AVX512v2_X64 = 34,
AVX512v3_X64 = 35,
AVX10v1_X64 = 36,
AVX10v2_X64 = 37,
AES_X64 = 38,
AVX512VP2INTERSECT_X64 = 39,
AVXIFMA_X64 = 40,
AVXVNNI_X64 = 41,
GFNI_X64 = 42,
SHA_X64 = 43,
WAITPKG_X64 = 44,
X86Serialize_X64 = 45,
}
public enum InstructionSet_X86
@ -241,52 +235,50 @@ namespace Internal.JitInterface
ILLEGAL = InstructionSet.ILLEGAL,
NONE = InstructionSet.NONE,
X86Base = 1,
SSE42 = 2,
AVX = 3,
AVX2 = 4,
AVX512 = 5,
AVX512v2 = 6,
AVX512v3 = 7,
AVX10v1 = 8,
AVX10v2 = 9,
APX = 10,
AES = 11,
AES_V256 = 12,
AES_V512 = 13,
AVX512VP2INTERSECT = 14,
AVXIFMA = 15,
AVXVNNI = 16,
GFNI = 17,
GFNI_V256 = 18,
GFNI_V512 = 19,
SHA = 20,
WAITPKG = 21,
X86Serialize = 22,
Vector128 = 23,
Vector256 = 24,
Vector512 = 25,
VectorT128 = 26,
VectorT256 = 27,
VectorT512 = 28,
AVXVNNIINT = 29,
AVXVNNIINT_V512 = 30,
X86Base_X64 = 31,
SSE42_X64 = 32,
AVX_X64 = 33,
AVX2_X64 = 34,
AVX512_X64 = 35,
AVX512v2_X64 = 36,
AVX512v3_X64 = 37,
AVX10v1_X64 = 38,
AVX10v2_X64 = 39,
AES_X64 = 40,
AVX512VP2INTERSECT_X64 = 41,
AVXIFMA_X64 = 42,
AVXVNNI_X64 = 43,
GFNI_X64 = 44,
SHA_X64 = 45,
WAITPKG_X64 = 46,
X86Serialize_X64 = 47,
AVX = 2,
AVX2 = 3,
AVX512 = 4,
AVX512v2 = 5,
AVX512v3 = 6,
AVX10v1 = 7,
AVX10v2 = 8,
APX = 9,
AES = 10,
AES_V256 = 11,
AES_V512 = 12,
AVX512VP2INTERSECT = 13,
AVXIFMA = 14,
AVXVNNI = 15,
GFNI = 16,
GFNI_V256 = 17,
GFNI_V512 = 18,
SHA = 19,
WAITPKG = 20,
X86Serialize = 21,
Vector128 = 22,
Vector256 = 23,
Vector512 = 24,
VectorT128 = 25,
VectorT256 = 26,
VectorT512 = 27,
AVXVNNIINT = 28,
AVXVNNIINT_V512 = 29,
X86Base_X64 = 30,
AVX_X64 = 31,
AVX2_X64 = 32,
AVX512_X64 = 33,
AVX512v2_X64 = 34,
AVX512v3_X64 = 35,
AVX10v1_X64 = 36,
AVX10v2_X64 = 37,
AES_X64 = 38,
AVX512VP2INTERSECT_X64 = 39,
AVXIFMA_X64 = 40,
AVXVNNI_X64 = 41,
GFNI_X64 = 42,
SHA_X64 = 43,
WAITPKG_X64 = 44,
X86Serialize_X64 = 45,
}
public unsafe struct InstructionSetFlags : IEnumerable<InstructionSet>
@ -525,10 +517,6 @@ namespace Internal.JitInterface
resultflags.AddInstructionSet(InstructionSet.X64_X86Base_X64);
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base_X64))
resultflags.AddInstructionSet(InstructionSet.X64_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42_X64);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42_X64))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX))
resultflags.AddInstructionSet(InstructionSet.X64_AVX_X64);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX_X64))
@ -589,10 +577,8 @@ namespace Internal.JitInterface
resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize_X64);
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Serialize_X64))
resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42))
resultflags.AddInstructionSet(InstructionSet.X64_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42);
resultflags.AddInstructionSet(InstructionSet.X64_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2))
resultflags.AddInstructionSet(InstructionSet.X64_AVX);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX512))
@ -622,7 +608,7 @@ namespace Internal.JitInterface
if (resultflags.HasInstructionSet(InstructionSet.X64_AVXVNNI))
resultflags.AddInstructionSet(InstructionSet.X64_AVX2);
if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42);
resultflags.AddInstructionSet(InstructionSet.X64_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI_V256))
resultflags.AddInstructionSet(InstructionSet.X64_GFNI);
if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI_V256))
@ -656,10 +642,8 @@ namespace Internal.JitInterface
break;
case TargetArchitecture.X86:
if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42))
resultflags.AddInstructionSet(InstructionSet.X86_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X86_AVX))
resultflags.AddInstructionSet(InstructionSet.X86_SSE42);
resultflags.AddInstructionSet(InstructionSet.X86_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X86_AVX2))
resultflags.AddInstructionSet(InstructionSet.X86_AVX);
if (resultflags.HasInstructionSet(InstructionSet.X86_AVX512))
@ -689,7 +673,7 @@ namespace Internal.JitInterface
if (resultflags.HasInstructionSet(InstructionSet.X86_AVXVNNI))
resultflags.AddInstructionSet(InstructionSet.X86_AVX2);
if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI))
resultflags.AddInstructionSet(InstructionSet.X86_SSE42);
resultflags.AddInstructionSet(InstructionSet.X86_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI_V256))
resultflags.AddInstructionSet(InstructionSet.X86_GFNI);
if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI_V256))
@ -799,8 +783,6 @@ namespace Internal.JitInterface
case TargetArchitecture.X64:
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base_X64))
resultflags.AddInstructionSet(InstructionSet.X64_X86Base);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42_X64))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX_X64))
resultflags.AddInstructionSet(InstructionSet.X64_AVX);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2_X64))
@ -832,8 +814,6 @@ namespace Internal.JitInterface
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Serialize_X64))
resultflags.AddInstructionSet(InstructionSet.X64_X86Serialize);
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base))
resultflags.AddInstructionSet(InstructionSet.X64_SSE42);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42))
resultflags.AddInstructionSet(InstructionSet.X64_AVX);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX))
resultflags.AddInstructionSet(InstructionSet.X64_AVX2);
@ -863,7 +843,7 @@ namespace Internal.JitInterface
resultflags.AddInstructionSet(InstructionSet.X64_AVXIFMA);
if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2))
resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42))
if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base))
resultflags.AddInstructionSet(InstructionSet.X64_GFNI);
if (resultflags.HasInstructionSet(InstructionSet.X64_GFNI))
resultflags.AddInstructionSet(InstructionSet.X64_GFNI_V256);
@ -899,8 +879,6 @@ namespace Internal.JitInterface
case TargetArchitecture.X86:
if (resultflags.HasInstructionSet(InstructionSet.X86_X86Base))
resultflags.AddInstructionSet(InstructionSet.X86_SSE42);
if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42))
resultflags.AddInstructionSet(InstructionSet.X86_AVX);
if (resultflags.HasInstructionSet(InstructionSet.X86_AVX))
resultflags.AddInstructionSet(InstructionSet.X86_AVX2);
@ -930,7 +908,7 @@ namespace Internal.JitInterface
resultflags.AddInstructionSet(InstructionSet.X86_AVXIFMA);
if (resultflags.HasInstructionSet(InstructionSet.X86_AVX2))
resultflags.AddInstructionSet(InstructionSet.X86_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42))
if (resultflags.HasInstructionSet(InstructionSet.X86_X86Base))
resultflags.AddInstructionSet(InstructionSet.X86_GFNI);
if (resultflags.HasInstructionSet(InstructionSet.X86_GFNI))
resultflags.AddInstructionSet(InstructionSet.X86_GFNI_V256);
@ -971,10 +949,8 @@ namespace Internal.JitInterface
private static Dictionary<(string, TargetArchitecture), string> AllInstructionSetGroups { get; } = new()
{
{ ("x86-64", TargetArchitecture.X64), "base" },
{ ("x86-64", TargetArchitecture.X86), "base" },
{ ("x86-64-v2", TargetArchitecture.X64), "x86-64 sse4.2" },
{ ("x86-64-v2", TargetArchitecture.X86), "x86-64 sse4.2" },
{ ("x86-64-v2", TargetArchitecture.X64), "base" },
{ ("x86-64-v2", TargetArchitecture.X86), "base" },
{ ("x86-64-v3", TargetArchitecture.X64), "x86-64-v2 avx2" },
{ ("x86-64-v3", TargetArchitecture.X86), "x86-64-v2 avx2" },
{ ("x86-64-v4", TargetArchitecture.X64), "x86-64-v3 avx512" },
@ -1046,11 +1022,11 @@ namespace Internal.JitInterface
yield return new InstructionSetInfo("base", "X86Base", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Sse", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Sse2", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("sse4.2", "Sse42", InstructionSet.X64_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Sse3", InstructionSet.X64_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Ssse3", InstructionSet.X64_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Sse41", InstructionSet.X64_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Popcnt", InstructionSet.X64_SSE42, true);
yield return new InstructionSetInfo("base", "Sse42", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Sse3", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Ssse3", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Sse41", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("base", "Popcnt", InstructionSet.X64_X86Base, true);
yield return new InstructionSetInfo("avx", "Avx", InstructionSet.X64_AVX, true);
yield return new InstructionSetInfo("avx2", "Avx2", InstructionSet.X64_AVX2, true);
yield return new InstructionSetInfo("avx2", "Bmi1", InstructionSet.X64_AVX2, true);
@ -1119,11 +1095,11 @@ namespace Internal.JitInterface
yield return new InstructionSetInfo("base", "X86Base", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Sse", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Sse2", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("sse4.2", "Sse42", InstructionSet.X86_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Sse3", InstructionSet.X86_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Ssse3", InstructionSet.X86_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Sse41", InstructionSet.X86_SSE42, true);
yield return new InstructionSetInfo("sse4.2", "Popcnt", InstructionSet.X86_SSE42, true);
yield return new InstructionSetInfo("base", "Sse42", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Sse3", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Ssse3", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Sse41", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("base", "Popcnt", InstructionSet.X86_X86Base, true);
yield return new InstructionSetInfo("avx", "Avx", InstructionSet.X86_AVX, true);
yield return new InstructionSetInfo("avx2", "Avx2", InstructionSet.X86_AVX2, true);
yield return new InstructionSetInfo("avx2", "Bmi1", InstructionSet.X86_AVX2, true);
@ -1224,8 +1200,6 @@ namespace Internal.JitInterface
case TargetArchitecture.X64:
if (HasInstructionSet(InstructionSet.X64_X86Base))
AddInstructionSet(InstructionSet.X64_X86Base_X64);
if (HasInstructionSet(InstructionSet.X64_SSE42))
AddInstructionSet(InstructionSet.X64_SSE42_X64);
if (HasInstructionSet(InstructionSet.X64_AVX))
AddInstructionSet(InstructionSet.X64_AVX_X64);
if (HasInstructionSet(InstructionSet.X64_AVX2))
@ -1286,7 +1260,6 @@ namespace Internal.JitInterface
case TargetArchitecture.X64:
AddInstructionSet(InstructionSet.X64_X86Base_X64);
AddInstructionSet(InstructionSet.X64_SSE42_X64);
AddInstructionSet(InstructionSet.X64_AVX_X64);
AddInstructionSet(InstructionSet.X64_AVX2_X64);
AddInstructionSet(InstructionSet.X64_AVX512_X64);
@ -1306,7 +1279,6 @@ namespace Internal.JitInterface
case TargetArchitecture.X86:
AddInstructionSet(InstructionSet.X86_X86Base_X64);
AddInstructionSet(InstructionSet.X86_SSE42_X64);
AddInstructionSet(InstructionSet.X86_AVX_X64);
AddInstructionSet(InstructionSet.X86_AVX2_X64);
AddInstructionSet(InstructionSet.X86_AVX512_X64);
@ -1479,33 +1451,33 @@ namespace Internal.JitInterface
case "Sse42":
if (nestedTypeName == "X64")
{ return InstructionSet.X64_SSE42_X64; }
{ return InstructionSet.X64_X86Base_X64; }
else
{ return InstructionSet.X64_SSE42; }
{ return InstructionSet.X64_X86Base; }
case "Sse3":
if (nestedTypeName == "X64")
{ return InstructionSet.X64_SSE42_X64; }
{ return InstructionSet.X64_X86Base_X64; }
else
{ return InstructionSet.X64_SSE42; }
{ return InstructionSet.X64_X86Base; }
case "Ssse3":
if (nestedTypeName == "X64")
{ return InstructionSet.X64_SSE42_X64; }
{ return InstructionSet.X64_X86Base_X64; }
else
{ return InstructionSet.X64_SSE42; }
{ return InstructionSet.X64_X86Base; }
case "Sse41":
if (nestedTypeName == "X64")
{ return InstructionSet.X64_SSE42_X64; }
{ return InstructionSet.X64_X86Base_X64; }
else
{ return InstructionSet.X64_SSE42; }
{ return InstructionSet.X64_X86Base; }
case "Popcnt":
if (nestedTypeName == "X64")
{ return InstructionSet.X64_SSE42_X64; }
{ return InstructionSet.X64_X86Base_X64; }
else
{ return InstructionSet.X64_SSE42; }
{ return InstructionSet.X64_X86Base; }
case "Avx":
if (nestedTypeName == "X64")
@ -1800,19 +1772,19 @@ namespace Internal.JitInterface
{ return InstructionSet.X86_X86Base; }
case "Sse42":
{ return InstructionSet.X86_SSE42; }
{ return InstructionSet.X86_X86Base; }
case "Sse3":
{ return InstructionSet.X86_SSE42; }
{ return InstructionSet.X86_X86Base; }
case "Ssse3":
{ return InstructionSet.X86_SSE42; }
{ return InstructionSet.X86_X86Base; }
case "Sse41":
{ return InstructionSet.X86_SSE42; }
{ return InstructionSet.X86_X86Base; }
case "Popcnt":
{ return InstructionSet.X86_SSE42; }
{ return InstructionSet.X86_X86Base; }
case "Avx":
{ return InstructionSet.X86_AVX; }

View File

@ -32,11 +32,11 @@ instructionset ,X86 ,X86Base , ,22 ,X86Base
instructionset ,X86 ,Sse , ,1 ,X86Base ,base
instructionset ,X86 ,Sse2 , ,2 ,X86Base ,base
instructionset ,X86 ,Sse42 , ,6 ,SSE42 ,sse4.2
instructionset ,X86 ,Sse3 , ,3 ,SSE42 ,sse4.2
instructionset ,X86 ,Ssse3 , ,4 ,SSE42 ,sse4.2
instructionset ,X86 ,Sse41 , ,5 ,SSE42 ,sse4.2
instructionset ,X86 ,Popcnt , ,15 ,SSE42 ,sse4.2
instructionset ,X86 ,Sse42 , ,6 ,X86Base ,base
instructionset ,X86 ,Sse3 , ,3 ,X86Base ,base
instructionset ,X86 ,Ssse3 , ,4 ,X86Base ,base
instructionset ,X86 ,Sse41 , ,5 ,X86Base ,base
instructionset ,X86 ,Popcnt , ,15 ,X86Base ,base
instructionset ,X86 ,Avx , ,7 ,AVX ,avx
@ -122,7 +122,6 @@ instructionset ,X86 ,AvxVnniInt16_V512 , ,63 ,AVXVNNI
; 64-bit Instruction Sets
instructionset64bit,X86 ,X86Base
instructionset64bit,X86 ,SSE42
instructionset64bit,X86 ,AVX
instructionset64bit,X86 ,AVX2
@ -153,9 +152,7 @@ vectorinstructionset,X86 ,Vector512
; Implications
implication ,X86 ,SSE42 ,X86Base
implication ,X86 ,AVX ,SSE42
implication ,X86 ,AVX ,X86Base
implication ,X86 ,AVX2 ,AVX
implication ,X86 ,AVX512 ,AVX2
@ -175,7 +172,7 @@ implication ,X86 ,AVX512VP2INTERSECT ,AVX512
implication ,X86 ,AVXIFMA ,AVX2
implication ,X86 ,AVXVNNI ,AVX2
implication ,X86 ,GFNI ,SSE42
implication ,X86 ,GFNI ,X86Base
implication ,X86 ,GFNI_V256 ,GFNI
implication ,X86 ,GFNI_V256 ,AVX
implication ,X86 ,GFNI_V512 ,GFNI_V256
@ -264,8 +261,7 @@ implication ,RiscV64 ,Zbb ,RiscV64Base
implication ,RiscV64 ,Zba ,RiscV64Base
; ,name and aliases ,archs ,lower baselines included by implication
instructionsetgroup ,x86-64 ,X64 X86 ,base
instructionsetgroup ,x86-64-v2 ,X64 X86 ,x86-64 sse4.2
instructionsetgroup ,x86-64-v2 ,X64 X86 ,base
instructionsetgroup ,x86-64-v3 ,X64 X86 ,x86-64-v2 avx2
instructionsetgroup ,x86-64-v4 ,X64 X86 ,x86-64-v3 avx512
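
Group names in the last column expand recursively through their "lower baselines" entries, e.g. "x86-64-v3" -> "x86-64-v2 avx2" -> "base avx2". A sketch of that resolution (hypothetical helper, not the actual tooling):

using System.Collections.Generic;

// Hypothetical: expands a group like "x86-64-v3" into its leaf instruction sets.
static IEnumerable<string> Expand(string name, IReadOnlyDictionary<string, string> groups)
{
    if (!groups.TryGetValue(name, out string members))
    {
        yield return name; // a leaf instruction set, e.g. "avx2"
        yield break;
    }
    foreach (string member in members.Split(' '))
    {
        foreach (string leaf in Expand(member, groups))
        {
            yield return leaf;
        }
    }
}
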

View File

@ -108,7 +108,8 @@ namespace ILCompiler
TargetOS targetOS = Get(_command.TargetOS);
InstructionSetSupport instructionSetSupport = Helpers.ConfigureInstructionSetSupport(Get(_command.InstructionSet), Get(_command.MaxVectorTBitWidth), isVectorTOptimistic, targetArchitecture, targetOS,
"Unrecognized instruction set {0}", "Unsupported combination of instruction sets: {0}/{1}", logger,
optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize);
optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize,
isReadyToRun: false);
string systemModuleName = Get(_command.SystemModuleName);
string reflectionData = Get(_command.ReflectionData);

View File

@ -86,7 +86,9 @@ namespace ILCompiler
TargetArchitecture targetArchitecture = Get(_command.TargetArchitecture);
TargetOS targetOS = Get(_command.TargetOS);
InstructionSetSupport instructionSetSupport = Helpers.ConfigureInstructionSetSupport(Get(_command.InstructionSet), Get(_command.MaxVectorTBitWidth), isVectorTOptimistic, targetArchitecture, targetOS,
SR.InstructionSetMustNotBe, SR.InstructionSetInvalidImplication, logger);
SR.InstructionSetMustNotBe, SR.InstructionSetInvalidImplication, logger,
optimizingForSize: _command.OptimizationMode == OptimizationMode.PreferSize,
isReadyToRun: true);
SharedGenericsMode genericsMode = SharedGenericsMode.CanonicalReferenceTypes;
var targetDetails = new TargetDetails(targetArchitecture, targetOS, Crossgen2RootCommand.IsArmel ? TargetAbi.NativeAotArmel : TargetAbi.NativeAot, instructionSetSupport.GetVectorTSimdVector());

View File

@ -1180,6 +1180,19 @@ void EEJitManager::SetCpuInfo()
int cpuFeatures = minipal_getcpufeatures();
if ((cpuFeatures & IntrinsicConstants_Invalid) != 0)
{
#if defined(TARGET_X86) || defined(TARGET_AMD64)
EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT\n"));
#elif defined(TARGET_ARM64) && (defined(TARGET_WINDOWS) || defined(TARGET_APPLE))
EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd, LSE\n"));
#elif defined(TARGET_ARM64)
EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the following instruction sets: AdvSimd\n"));
#else
EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("\nThe current CPU is missing one or more of the baseline instruction sets.\n"));
#endif
}
// Get the maximum bit width of Vector<T>, rounding down to the nearest multiple of 128 bits
uint32_t maxVectorTBitWidth = (CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_MaxVectorTBitWidth) / 128) * 128;
@ -1198,20 +1211,13 @@ void EEJitManager::SetCpuInfo()
CPUCompileFlags.Set(InstructionSet_VectorT512);
}
// x86-64-v1
// x86-64-v2
if (CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableHWIntrinsic))
{
CPUCompileFlags.Set(InstructionSet_X86Base);
}
// x86-64-v2
if (((cpuFeatures & XArchIntrinsicConstants_Sse42) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableSSE42))
{
CPUCompileFlags.Set(InstructionSet_SSE42);
}
// x86-64-v3
if (((cpuFeatures & XArchIntrinsicConstants_Avx) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX))
@ -6317,7 +6323,7 @@ size_t ReadyToRunJitManager::WalkILOffsets(
BoundsType boundsType,
void* pContext,
size_t (* pfnWalkILOffsets)(ICorDebugInfo::OffsetMapping *pOffsetMapping, void *pContext))
{
CONTRACTL {
THROWS; // on OOM.
GC_NOTRIGGER; // getting vars shouldn't trigger

View File

@ -238,8 +238,17 @@ int minipal_getcpufeatures(void)
bool hasAvx2Dependencies = false;
bool hasAvx10v1Dependencies = false;
assert((cpuidInfo[CPUID_EDX] & (1 << 25)) != 0); // SSE
assert((cpuidInfo[CPUID_EDX] & (1 << 26)) != 0); // SSE2
if (((cpuidInfo[CPUID_EDX] & (1 << 25)) == 0) || // SSE
((cpuidInfo[CPUID_EDX] & (1 << 26)) == 0) || // SSE2
((cpuidInfo[CPUID_ECX] & (1 << 0)) == 0) || // SSE3
((cpuidInfo[CPUID_ECX] & (1 << 9)) == 0) || // SSSE3
((cpuidInfo[CPUID_ECX] & (1 << 19)) == 0) || // SSE4.1
((cpuidInfo[CPUID_ECX] & (1 << 20)) == 0) || // SSE4.2
((cpuidInfo[CPUID_ECX] & (1 << 23)) == 0)) // POPCNT
{
// One of the baseline ISAs is not supported
result |= IntrinsicConstants_Invalid;
}
if (((cpuidInfo[CPUID_ECX] & (1 << 25)) != 0) && // AESNI
((cpuidInfo[CPUID_ECX] & (1 << 1)) != 0)) // PCLMULQDQ
@ -247,27 +256,18 @@ int minipal_getcpufeatures(void)
result |= XArchIntrinsicConstants_Aes;
}
if (((cpuidInfo[CPUID_ECX] & (1 << 0)) != 0) && // SSE3
((cpuidInfo[CPUID_ECX] & (1 << 9)) != 0) && // SSSE3
((cpuidInfo[CPUID_ECX] & (1 << 19)) != 0) && // SSE4.1
((cpuidInfo[CPUID_ECX] & (1 << 20)) != 0) && // SSE4.2
((cpuidInfo[CPUID_ECX] & (1 << 23)) != 0)) // POPCNT
if (((cpuidInfo[CPUID_ECX] & (1 << 27)) != 0) && // OSXSAVE
((cpuidInfo[CPUID_ECX] & (1 << 28)) != 0)) // AVX
{
result |= XArchIntrinsicConstants_Sse42;
if (IsAvxEnabled() && (xmmYmmStateSupport() == 1)) // XGETBV == 11
{
result |= XArchIntrinsicConstants_Avx;
if (((cpuidInfo[CPUID_ECX] & (1 << 29)) != 0) && // F16C
((cpuidInfo[CPUID_ECX] & (1 << 12)) != 0) && // FMA
((cpuidInfo[CPUID_ECX] & (1 << 22)) != 0)) // MOVBE
{
hasAvx2Dependencies = true;
}
}
}
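
The leaf-1 baseline check above can be mirrored from managed code with X86Base.CpuId; a sketch of the same bit tests (bit positions per the Intel SDM; this mirrors the minipal logic, it is not the minipal API):

using System.Runtime.Intrinsics.X86;

(int eax, int ebx, int ecx, int edx) = X86Base.CpuId(1, 0);

bool hasBaseline =
    (edx & (1 << 25)) != 0 && // SSE
    (edx & (1 << 26)) != 0 && // SSE2
    (ecx & (1 << 0))  != 0 && // SSE3
    (ecx & (1 << 9))  != 0 && // SSSE3
    (ecx & (1 << 19)) != 0 && // SSE4.1
    (ecx & (1 << 20)) != 0 && // SSE4.2
    (ecx & (1 << 23)) != 0;   // POPCNT
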
@ -455,14 +455,18 @@ int minipal_getcpufeatures(void)
#if HAVE_AUXV_HWCAP_H
unsigned long hwCap = getauxval(AT_HWCAP);
assert(hwCap & HWCAP_ASIMD);
if ((hwCap & HWCAP_ASIMD) == 0)
{
// One of the baseline ISAs is not supported
result |= IntrinsicConstants_Invalid;
}
if ((hwCap & HWCAP_ATOMICS) != 0)
result |= ARM64IntrinsicConstants_Atomics;
if (hwCap & HWCAP_AES)
result |= ARM64IntrinsicConstants_Aes;
if (hwCap & HWCAP_ATOMICS)
result |= ARM64IntrinsicConstants_Atomics;
if (hwCap & HWCAP_CRC32)
result |= ARM64IntrinsicConstants_Crc32;
@ -498,6 +502,17 @@ int minipal_getcpufeatures(void)
int64_t valueFromSysctl = 0;
size_t sz = sizeof(valueFromSysctl);
if ((sysctlbyname("hw.optional.AdvSIMD", &valueFromSysctl, &sz, NULL, 0) != 0) || (valueFromSysctl == 0) ||
(sysctlbyname("hw.optional.arm.FEAT_LSE", &valueFromSysctl, &sz, NULL, 0) != 0) || (valueFromSysctl == 0))
{
// One of the baseline ISAs is not supported
result |= IntrinsicConstants_Invalid;
}
else
{
result |= ARM64IntrinsicConstants_Atomics;
}
if ((sysctlbyname("hw.optional.arm.FEAT_AES", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0))
result |= ARM64IntrinsicConstants_Aes;
@ -516,9 +531,6 @@ int minipal_getcpufeatures(void)
if ((sysctlbyname("hw.optional.arm.FEAT_SHA256", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0))
result |= ARM64IntrinsicConstants_Sha256;
if ((sysctlbyname("hw.optional.armv8_1_atomics", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0))
result |= ARM64IntrinsicConstants_Atomics;
if ((sysctlbyname("hw.optional.arm.FEAT_LRCPC", &valueFromSysctl, &sz, NULL, 0) == 0) && (valueFromSysctl != 0))
result |= ARM64IntrinsicConstants_Rcpc;
@ -529,6 +541,17 @@ int minipal_getcpufeatures(void)
#endif // HOST_UNIX
#if defined(HOST_WINDOWS)
if (!IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE))
{
// One of the baseline ISAs is not supported
result |= IntrinsicConstants_Invalid;
}
else
{
result |= ARM64IntrinsicConstants_Atomics;
}
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
{
result |= ARM64IntrinsicConstants_Aes;
@ -541,11 +564,6 @@ int minipal_getcpufeatures(void)
result |= ARM64IntrinsicConstants_Crc32;
}
if (IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE))
{
result |= ARM64IntrinsicConstants_Atomics;
}
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
{
result |= ARM64IntrinsicConstants_Dp;
@ -578,7 +596,6 @@ int minipal_getcpufeatures(void)
{
result |= ARM64IntrinsicConstants_Sve2;
}
#endif // HOST_WINDOWS
#endif // HOST_ARM64

View File

@ -8,28 +8,28 @@
// Should match the constants defined in the compiler in HardwareIntrinsicHelpers.cs
//
// Reserve the last bit to indicate an invalid query, such as if a baseline ISA isn't supported
#define IntrinsicConstants_Invalid (1 << 31)
#if defined(HOST_X86) || defined(HOST_AMD64)
#define XArchIntrinsicConstants_Sse42 (1 << 0)
#define XArchIntrinsicConstants_Avx (1 << 1)
#define XArchIntrinsicConstants_Avx2 (1 << 2)
#define XArchIntrinsicConstants_Avx512 (1 << 3)
#define XArchIntrinsicConstants_Avx512v2 (1 << 4)
#define XArchIntrinsicConstants_Avx512v3 (1 << 5)
#define XArchIntrinsicConstants_Avx10v1 (1 << 6)
#define XArchIntrinsicConstants_Avx10v2 (1 << 7)
#define XArchIntrinsicConstants_Apx (1 << 8)
#define XArchIntrinsicConstants_Aes (1 << 9)
#define XArchIntrinsicConstants_Avx512Vp2intersect (1 << 10)
#define XArchIntrinsicConstants_AvxIfma (1 << 11)
#define XArchIntrinsicConstants_AvxVnni (1 << 12)
#define XArchIntrinsicConstants_Avx (1 << 0)
#define XArchIntrinsicConstants_Avx2 (1 << 1)
#define XArchIntrinsicConstants_Avx512 (1 << 2)
#define XArchIntrinsicConstants_Avx512v2 (1 << 3)
#define XArchIntrinsicConstants_Avx512v3 (1 << 4)
#define XArchIntrinsicConstants_Avx10v1 (1 << 5)
#define XArchIntrinsicConstants_Avx10v2 (1 << 6)
#define XArchIntrinsicConstants_Apx (1 << 7)
#define XArchIntrinsicConstants_Aes (1 << 8)
#define XArchIntrinsicConstants_Avx512Vp2intersect (1 << 9)
#define XArchIntrinsicConstants_AvxIfma (1 << 10)
#define XArchIntrinsicConstants_AvxVnni (1 << 11)
#define XArchIntrinsicConstants_AvxVnniInt (1 << 12)
#define XArchIntrinsicConstants_Gfni (1 << 13)
#define XArchIntrinsicConstants_Sha (1 << 14)
#define XArchIntrinsicConstants_Vaes (1 << 15)
#define XArchIntrinsicConstants_WaitPkg (1 << 16)
#define XArchIntrinsicConstants_X86Serialize (1 << 17)
#define XArchIntrinsicConstants_AvxVnniInt (1 << 18)
#endif // HOST_X86 || HOST_AMD64
#if defined(HOST_ARM64)
@ -50,7 +50,6 @@
// Bit position for the ARM64IntrinsicConstants_Atomics flags, to be used with tbz / tbnz instructions
#define ARM64_ATOMICS_FEATURE_FLAG_BIT 6
static_assert((1 << ARM64_ATOMICS_FEATURE_FLAG_BIT) == ARM64IntrinsicConstants_Atomics, "ARM64_ATOMICS_FEATURE_FLAG_BIT must match with ARM64IntrinsicConstants_Atomics");
#endif // HOST_ARM64
#if defined(HOST_RISCV64)

View File

@ -22,7 +22,6 @@
DOTNET_EnableAVX2;
DOTNET_EnableAVX512;
DOTNET_EnableHWIntrinsic;
DOTNET_EnableSSE42;
DOTNET_EnableAPX;
DOTNET_JitStressEvexEncoding;
DOTNET_PreferredVectorBitWidth;
@ -103,26 +102,23 @@
<TestEnvironment Include="jitstress2_tiered" JitStress="2" TieredCompilation="1" />
<TestEnvironment Include="jitstress_isas_nohwintrinsic" EnableHWIntrinsic="0" />
<TestEnvironment Include="jitstress_isas_x86_evex" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" />
<TestEnvironment Include="jitstress_isas_x86_noavx" EnableAVX="0" /> <!-- Depends on SSE42 -->
<TestEnvironment Include="jitstress_isas_x86_noavx" EnableAVX="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_x86_noavx2" EnableAVX2="0" /> <!-- Depends on AVX -->
<TestEnvironment Include="jitstress_isas_x86_noavx512" EnableAVX512="0" /> <!-- Depends on AVX2 -->
<TestEnvironment Include="jitstress_isas_x86_nosse3" EnableSSE42="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_x86_vectort128" JitStressEvexEncoding="1" MaxVectorTBitWidth="128" />
<TestEnvironment Include="jitstress_isas_x86_vectort512" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" MaxVectorTBitWidth="512" />
<TestEnvironment Include="jitstress_isas_x86_noavx512_vectort128" EnableAVX512="0" MaxVectorTBitWidth="128" />
<TestEnvironment Include="jitstress_isas_1_x86_evex" JitStress="1" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" />
<TestEnvironment Include="jitstress_isas_1_x86_noavx" JitStress="1" EnableAVX="0" /> <!-- Depends on SSE42 -->
<TestEnvironment Include="jitstress_isas_1_x86_noavx" JitStress="1" EnableAVX="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_1_x86_noavx2" JitStress="1" EnableAVX2="0" /> <!-- Depends on AVX -->
<TestEnvironment Include="jitstress_isas_1_x86_noavx512" JitStress="1" EnableAVX512="0" /> <!-- Depends on AVX2 -->
<TestEnvironment Include="jitstress_isas_1_x86_nosse3" JitStress="1" EnableSSE42="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_1_x86_vectort128" JitStress="1" JitStressEvexEncoding="1" MaxVectorTBitWidth="128" />
<TestEnvironment Include="jitstress_isas_1_x86_vectort512" JitStress="1" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" MaxVectorTBitWidth="512" />
<TestEnvironment Include="jitstress_isas_1_x86_noavx512_vectort128" JitStress="1" EnableAVX512="0" MaxVectorTBitWidth="128" />
<TestEnvironment Include="jitstress_isas_2_x86_evex" JitStress="2" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" />
<TestEnvironment Include="jitstress_isas_2_x86_noavx" JitStress="2" EnableAVX="0" /> <!-- Depends on SSE42 -->
<TestEnvironment Include="jitstress_isas_2_x86_noavx" JitStress="2" EnableAVX="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_2_x86_noavx2" JitStress="2" EnableAVX2="0" /> <!-- Depends on AVX -->
<TestEnvironment Include="jitstress_isas_2_x86_noavx512" JitStress="2" EnableAVX512="0" /> <!-- Depends on AVX2 -->
<TestEnvironment Include="jitstress_isas_2_x86_nosse3" JitStress="2" EnableSSE42="0" /> <!-- Depends on Baseline -->
<TestEnvironment Include="jitstress_isas_2_x86_vectort128" JitStress="2" JitStressEvexEncoding="1" MaxVectorTBitWidth="128" />
<TestEnvironment Include="jitstress_isas_2_x86_vectort512" JitStress="2" JitStressEvexEncoding="1" PreferredVectorBitWidth="512" MaxVectorTBitWidth="512" />
<TestEnvironment Include="jitstress_isas_2_x86_noavx512_vectort128" JitStress="2" EnableAVX512="0" MaxVectorTBitWidth="128" />

View File

@ -70,7 +70,7 @@ namespace XarchHardwareIntrinsicTest._CpuId
for (int i = 0; i < 2; i++)
{
// SSE, SSE2 are paired
// SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT are paired
if (IsBitIncorrect(edx, 25, typeof(Sse), Sse.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
@ -81,6 +81,31 @@ namespace XarchHardwareIntrinsicTest._CpuId
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 0, typeof(Sse3), Sse3.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 9, typeof(Ssse3), Ssse3.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 19, typeof(Sse41), Sse41.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 20, typeof(Sse42), Sse42.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 23, typeof(Popcnt), Popcnt.IsSupported, "HWIntrinsic", ref isHierarchyDisabled))
{
testResult = Fail;
}
}
bool isBaselineHierarchyDisabled = isHierarchyDisabled;
@ -100,40 +125,6 @@ namespace XarchHardwareIntrinsicTest._CpuId
}
}
isHierarchyDisabled = isBaselineHierarchyDisabled;
for (int i = 0; i < 2; i++)
{
// SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT are paired
if (IsBitIncorrect(ecx, 0, typeof(Sse3), Sse3.IsSupported, "SSE42", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 9, typeof(Ssse3), Ssse3.IsSupported, "SSE42", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 19, typeof(Sse41), Sse41.IsSupported, "SSE42", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 20, typeof(Sse42), Sse42.IsSupported, "SSE42", ref isHierarchyDisabled))
{
testResult = Fail;
}
if (IsBitIncorrect(ecx, 23, typeof(Popcnt), Popcnt.IsSupported, "SSE42", ref isHierarchyDisabled))
{
testResult = Fail;
}
}
bool isSse42HierarchyDisabled = isHierarchyDisabled;
if (IsBitIncorrect(ecx, 28, typeof(Avx), Avx.IsSupported, "AVX", ref isHierarchyDisabled))
{
testResult = Fail;
@ -280,7 +271,7 @@ namespace XarchHardwareIntrinsicTest._CpuId
testResult = Fail;
}
isHierarchyDisabled = isSse42HierarchyDisabled;
isHierarchyDisabled = isBaselineHierarchyDisabled;
if (IsBitIncorrect(ecx, 8, typeof(Gfni), Gfni.IsSupported, "GFNI", ref isHierarchyDisabled))
{

View File

@ -48,50 +48,6 @@ unsafe class Program
bool? ExpectedSse2 = true;
#if BASELINE_INTRINSICS
bool? ExpectedSse3 = null;
bool? ExpectedSsse3 = null;
bool? ExpectedSse41 = null;
bool? ExpectedSse42 = null;
bool? ExpectedPopcnt = null;
bool? ExpectedAes = null;
bool? ExpectedPclmulqdq = null;
bool? ExpectedGfni = null;
bool? ExpectedSha = null;
bool? ExpectedWaitPkg = null;
bool? ExpectedX86Serialize = null;
bool? ExpectedAvx = false;
bool? ExpectedAvx2 = false;
bool? ExpectedBmi1 = false;
bool? ExpectedBmi2 = false;
bool? ExpectedF16c = false;
bool? ExpectedFma = false;
bool? ExpectedLzcnt = false;
bool? ExpectedAvx512F = false;
bool? ExpectedAvx512BW = false;
bool? ExpectedAvx512CD = false;
bool? ExpectedAvx512DQ = false;
bool? ExpectedAvx512Vbmi = false;
bool? ExpectedAvx512Bitalg = false;
bool? ExpectedAvx512Vbmi2 = false;
bool? ExpectedAvx512Vpopcntdq = false;
bool? ExpectedAvx512Bf16 = false;
bool? ExpectedAvx512Fp16 = false;
bool? ExpectedAvx10v1 = false;
bool? ExpectedAvx10v1V512 = false;
bool? ExpectedAvx10v2 = false;
bool? ExpectedAvx512Vp2intersect = false;
bool? ExpectedAvxIfma = false;
bool? ExpectedAvxVnni = false;
bool? ExpectedAvxVnniInt = false;
bool? ExpectedAvxVnniIntV512 = false;
bool? ExpectedGfniV256 = false;
bool? ExpectedGfniV512 = false;
bool? ExpectedAesV256 = false;
bool? ExpectedAesV512 = false;
bool? ExpectedPclmulqdqV256 = false;
bool? ExpectedPclmulqdqV512 = false;
#elif SSE42_INTRINSICS
bool? ExpectedSse3 = true;
bool? ExpectedSsse3 = true;
bool? ExpectedSse41 = true;

View File

@ -1,23 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<CLRTestPriority>0</CLRTestPriority>
<CLRTestTargetUnsupported Condition="'$(TargetArchitecture)' != 'x64'">true</CLRTestTargetUnsupported>
<!-- Sanitizers increase the binary size, so it ends up outside of our expected range. -->
<CLRTestTargetUnsupported Condition="'$(EnableNativeSanitizers)' != ''">true</CLRTestTargetUnsupported>
<!-- Test infra issue on apple devices: https://github.com/dotnet/runtime/issues/89917 -->
<CLRTestTargetUnsupported Condition="'$(TargetsAppleMobile)' == 'true'">true</CLRTestTargetUnsupported>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<DefineConstants>$(DefineConstants);SSE42_INTRINSICS;VECTORT128_INTRINSICS</DefineConstants>
<RequiresProcessIsolation>true</RequiresProcessIsolation>
<ReferenceXUnitWrapperGenerator>false</ReferenceXUnitWrapperGenerator>
</PropertyGroup>
<ItemGroup>
<IlcArg Include="--instruction-set:sse4.2" />
</ItemGroup>
<ItemGroup>
<Compile Include="Program.cs" />
</ItemGroup>
</Project>

View File

@ -89,11 +89,16 @@ class TestHardwareIntrinsics
public static bool IsAvxVnniSupported = AvxVnni.IsSupported;
}
class Complex
class Simple3
{
public static bool IsPopcntSupported = Popcnt.IsSupported;
}
class Complex
{
public static bool IsX86SerializeSupported = X86Serialize.IsSupported;
}
public static void Run()
{
Assert.IsPreinitialized(typeof(Simple1));
@ -102,11 +107,14 @@ class TestHardwareIntrinsics
Assert.IsPreinitialized(typeof(Simple2));
Assert.AreEqual(AvxVnni.IsSupported, Simple2.IsAvxVnniSupported);
Assert.IsPreinitialized(typeof(Simple3));
Assert.AreEqual(Popcnt.IsSupported, Simple3.IsPopcntSupported);
if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64)
Assert.IsLazyInitialized(typeof(Complex));
else
Assert.IsPreinitialized(typeof(Complex));
Assert.AreEqual(Popcnt.IsSupported, Complex.IsPopcntSupported);
Assert.AreEqual(X86Serialize.IsSupported, Complex.IsX86SerializeSupported);
}
}
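
Illustratively, the distinction this test relies on: under the x86-64-v2 baseline, Popcnt.IsSupported folds to a constant during AOT compilation, so a type whose initializer only reads it can be preinitialized, while X86Serialize.IsSupported is merely in the optimistic set on x86/x64 and keeps the initializer lazy. A sketch (hypothetical class, not part of the test):

using System.Runtime.Intrinsics.X86;

class PreinitSketch
{
    // Baseline ISA: a build-time constant under x86-64-v2, so this is foldable.
    public static readonly bool HasPopcnt = Popcnt.IsSupported;

    // Optimistic ISA: remains a runtime check on x86/x64, forcing lazy initialization.
    public static readonly bool HasSerialize = X86Serialize.IsSupported;
}
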

View File

@ -1,22 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<!-- Needed for CLRTestTargetUnsupported, IlasmRoundTripIncompatible, NativeAotIncompatible -->
<RequiresProcessIsolation>true</RequiresProcessIsolation>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<CLRTestTargetUnsupported Condition="('$(TargetArchitecture)' != 'x64' AND '$(TargetArchitecture)' != 'x86') OR ('$(RuntimeFlavor)' != 'coreclr')">true</CLRTestTargetUnsupported>
</PropertyGroup>
<PropertyGroup>
<AlwaysUseCrossGen2>true</AlwaysUseCrossGen2>
<IlasmRoundTripIncompatible>true</IlasmRoundTripIncompatible>
<NativeAotIncompatible>true</NativeAotIncompatible>
</PropertyGroup>
<PropertyGroup>
<CrossGen2TestExtraArguments>$(CrossGen2TestExtraArguments) --instruction-set:sse4.2</CrossGen2TestExtraArguments>
</PropertyGroup>
<ItemGroup>
<Compile Include="../../../JIT/HardwareIntrinsics/X86/X86Base/CpuId.cs" />
</ItemGroup>
</Project>