From 8599161c16cbeb9ceee8a043038edff37400be5c Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Sun, 12 Mar 2023 08:57:11 -0700 Subject: [PATCH] Avx512 extract most significant bits (#82731) * Add `TYP_MASK` and `Vector512.ExtractMostSignificantBits`. * Rebase / rename error fix. * Review edits. * Formatting. * Review edits. * Review cleanup. * Build fixes. * Address throughput issues pertaining to `availableRegCount`. * kmov RR refactor. * Split kmov into kmov_msk and kmov_gpr. * Fix thread. * Review edits. --- src/coreclr/jit/emit.cpp | 3 +- src/coreclr/jit/emitxarch.cpp | 112 +++++++++++++++++++- src/coreclr/jit/emitxarch.h | 6 ++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 48 +++++++++ src/coreclr/jit/hwintrinsiclistxarch.h | 4 + src/coreclr/jit/hwintrinsicxarch.cpp | 14 +++ src/coreclr/jit/instr.cpp | 3 +- src/coreclr/jit/instrsxarch.h | 21 ++++ src/coreclr/jit/lsra.cpp | 17 ++- src/coreclr/jit/lsra.h | 34 ++++-- src/coreclr/jit/lsrabuild.cpp | 10 ++ src/coreclr/jit/lsraxarch.cpp | 10 ++ src/coreclr/jit/register.h | 26 ++++- src/coreclr/jit/target.h | 7 +- src/coreclr/jit/targetamd64.h | 7 ++ src/coreclr/jit/targetx86.h | 9 ++ src/coreclr/jit/typelist.h | 3 +- src/coreclr/jit/vartype.h | 12 ++- 18 files changed, 323 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 7f3ad3e38c32b7..f8231297062a7a 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8293,12 +8293,13 @@ void emitter::emitDispDataSec(dataSecDsc* section, BYTE* dst) i += j; break; + case 64: case 32: case 16: case 8: assert((data->dsSize % 8) == 0); printf("\tdq\t%016llXh", *reinterpret_cast<uint64_t*>(&data->dsCont[i])); - for (j = 8; j < 32; j += 8) + for (j = 8; j < 64; j += 8) { if (i + j >= data->dsSize) break; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index bdd03f7fd8de82..4d31097b5ad4a7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -34,8 +34,15 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } +bool emitter::IsKInstruction(instruction ins) +{ + return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION); +} + //------------------------------------------------------------------------ -// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction. +// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse or K (opmask) instruction. +// Technically, K instructions would be considered under the VEX encoding umbrella, but due to +// the instruction table encoding they had to be pulled out with the rest of the `INST5` definitions. // // Arguments: // ins - The instruction to check. // bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); + return ((ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION)); } bool emitter::IsAVXOnlyInstruction(instruction ins) @@ -263,6 +270,15 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2.
+ case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, // movdqu16 etc) // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand @@ -1248,6 +1264,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_vpgatherqq: case INS_vgatherdpd: case INS_vgatherqpd: + case INS_vpmovw2m: + case INS_vpmovq2m: return true; default: break; @@ -1294,6 +1312,9 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_shlx: case INS_sarx: case INS_shrx: + case INS_kmovq_msk: + case INS_kmovq_gpr: + case INS_kmovd_msk: return true; default: return false; @@ -3478,6 +3499,10 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. // This would probably be better expressed as a different format or something? code_t code = insCodeRM(ins); + if (IsKInstruction(ins)) + { + code = AddVexPrefix(ins, code, EA_SIZE(id->idOpSize())); + } UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code); @@ -5856,6 +5881,14 @@ bool emitter::IsMovInstruction(instruction ins) case INS_movupd: case INS_movups: case INS_movzx: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: { return true; } @@ -6006,6 +6039,19 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) } #endif // TARGET_AMD64 + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: + { + hasSideEffect = true; + break; + } + default: { unreached(); @@ -6223,6 +6269,25 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN } #endif // TARGET_AMD64 + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + { + assert((isMaskReg(dstReg) || isMaskReg(srcReg)) && !isGeneralRegister(dstReg) && + !isGeneralRegister(srcReg)); + break; + } + + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: + { + assert(isGeneralRegister(dstReg) || isGeneralRegister(srcReg)); + break; + } + default: { unreached(); @@ -9619,6 +9684,11 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) #ifdef TARGET_AMD64 char suffix = '\0'; + if (isMaskReg(reg)) + { + return rn; + } + switch (EA_SIZE(attr)) { case EA_64BYTE: @@ -13843,7 +13913,18 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); - if ((ins != INS_movd) || isFloatReg(reg1)) + if (ins == INS_kmovb_gpr || ins == INS_kmovw_gpr || ins == INS_kmovd_gpr || ins == INS_kmovq_gpr) + { + assert(!(isGeneralRegister(reg1) && isGeneralRegister(reg2))); + + code = insCodeRM(ins); + if (isGeneralRegister(reg1)) + { + // kmov r, k form, flip last byte of opcode from 0x92 to 0x93 + code |= 0x01; + } + } + else if ((ins != INS_movd) || isFloatReg(reg1)) { code = insCodeRM(ins); } @@ -18150,6 +18231,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } #endif + + case INS_vpmovb2m: + case INS_vpmovw2m: + case INS_vpmovd2m: + case INS_vpmovq2m: + { + result.insLatency += 
PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: + { + result.insLatency += PERFSCORE_LATENCY_3C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 84474c8a590569..eb6ebf375bd769 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -23,6 +23,11 @@ inline static bool isDoubleReg(regNumber reg) return isFloatReg(reg); } +inline static bool isMaskReg(regNumber reg) +{ + return (reg >= REG_MASK_FIRST && reg <= REG_MASK_LAST); +} + /************************************************************************/ /* Routines that compute the size of / encode instructions */ /************************************************************************/ @@ -96,6 +101,7 @@ static bool IsAvx512OnlyInstruction(instruction ins); static bool IsFMAInstruction(instruction ins); static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); +static bool IsKInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 274821ef81b55e..77a7ab22dc19cc 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1671,6 +1671,54 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_MoveMaskSpecial: + { + op1Reg = op1->GetRegNum(); + regNumber maskReg = node->ExtractTempReg(RBM_ALLMASK); + + instruction maskIns; + instruction kmovIns; + + // TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change + // if used for other vector lengths, i.e., TYP_BYTE requires kmovq for a 512-bit vector, but kmovd + // for a 256-bit vector. + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + maskIns = INS_vpmovb2m; + kmovIns = INS_kmovq_gpr; + break; + case TYP_SHORT: + case TYP_USHORT: + maskIns = INS_vpmovw2m; + kmovIns = INS_kmovd_gpr; + break; + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + maskIns = INS_vpmovd2m; + kmovIns = INS_kmovw_gpr; + break; + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + maskIns = INS_vpmovq2m; + kmovIns = INS_kmovb_gpr; + break; + default: + unreached(); + } + + // TODO-XARCH-AVX512 remove REG_K1 check when all K registers possible for + // allocation.
+ assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1); + + emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); + emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE); + break; + } + default: unreached(); break; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index d581ddc5a5711c..042f57891a4d5d 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -247,6 +247,8 @@ HARDWARE_INTRINSIC(Vector512, StoreAligned, HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} @@ -884,6 +886,8 @@ HARDWARE_INTRINSIC(SSE2, UCOMISD, HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) + #endif // FEATURE_HW_INTRINSIC #undef HARDWARE_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 2ccbe8e0d3aefc..7feb5336ca0e5c 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1248,6 +1248,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector512_ExtractMostSignificantBits: + { + if (IsBaselineVector512IsaSupported()) + { + var_types simdType = getSIMDTypeForSize(simdSize); + + op1 = impSIMDPopStack(simdType); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpecial, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + } + break; + } + case 
NI_Vector128_ExtractMostSignificantBits: case NI_Vector256_ExtractMostSignificantBits: { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 8c5006a8b9d5ee..c5c8c1924ea3fb 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -101,7 +101,8 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; - if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) && + !GetEmitter()->IsKInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); retbuf = buf[curBuf]; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a7d02c5c2af9f7..e089f6195f0de1 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -627,6 +627,22 @@ INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + +INST3(kmovb_msk, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovw_msk, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovd_msk, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovq_msk, "kmovq", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) + + +INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) + +INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + + INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) @@ -650,6 +666,11 @@ INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(vpmovb2m, "vpmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit) +INST3(vpmovw2m, "vpmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit) +INST3(vpmovd2m, "vpmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit) +INST3(vpmovq2m, "vpmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit) + INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 diff --git 
a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index f98029fa82334a..991786468a4316 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -636,16 +636,20 @@ LinearScan::LinearScan(Compiler* theCompiler) , refPositions(theCompiler->getAllocator(CMK_LSRA_RefPosition)) , listNodePool(theCompiler) { +#if defined(TARGET_XARCH) + availableRegCount = ACTUAL_REG_COUNT; + #if defined(TARGET_AMD64) rbmAllFloat = compiler->rbmAllFloat; rbmFltCalleeTrash = compiler->rbmFltCalleeTrash; - availableRegCount = ACTUAL_REG_COUNT; +#endif if (!compiler->DoJitStressEvexEncoding()) { availableRegCount -= CNT_HIGHFLOAT; + availableRegCount -= CNT_MASK_REGS; } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH regSelector = new (theCompiler, CMK_LSRA) RegisterSelection(this); firstColdLoc = MaxLocation; @@ -698,6 +702,9 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; +#if defined(TARGET_XARCH) + availableMaskRegs = RBM_ALLMASK; +#endif #if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (compiler->opts.compDbgEnC) @@ -737,6 +744,12 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableRegs[i] = &availableDoubleRegs; } +#ifdef TARGET_XARCH + else if (thisType == TYP_MASK) + { + availableRegs[i] = &availableMaskRegs; + } +#endif // TARGET_XARCH #endif // FEATURE_SIMD else { diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index d28a8d521d632b..6c8b961a44c7d5 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -35,6 +35,7 @@ const unsigned int RegisterTypeCount = 2; typedef var_types RegisterType; #define IntRegisterType TYP_INT #define FloatRegisterType TYP_FLOAT +#define MaskRegisterType TYP_MASK //------------------------------------------------------------------------ // regType: Return the RegisterType to use for a given type @@ -482,15 +483,21 @@ class RegRecord : public Referenceable } else #endif - if (emitter::isFloatReg(reg)) + if (emitter::isGeneralRegister(reg)) + { + assert(registerType == IntRegisterType); + } + else if (emitter::isFloatReg(reg)) { registerType = FloatRegisterType; } +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) else { - // The constructor defaults to IntRegisterType - assert(emitter::isGeneralRegister(reg) && registerType == IntRegisterType); + assert(emitter::isMaskReg(reg)); + registerType = MaskRegisterType; } +#endif regNum = reg; isCalleeSave = ((RBM_CALLEE_SAVED & genRegMask(reg)) != 0); } @@ -1090,6 +1097,9 @@ class LinearScan : public LinearScanInterface RefPosition* defineNewInternalTemp(GenTree* tree, RegisterType regType, regMaskTP candidates); RefPosition* buildInternalIntRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); RefPosition* buildInternalFloatRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); +#if defined(FEATURE_SIMD) + RefPosition* buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); +#endif void buildInternalRegisterUses(); void writeLocalReg(GenTreeLclVar* lclNode, unsigned varNum, regNumber reg); @@ -1595,9 +1605,12 @@ class LinearScan : public LinearScanInterface // A temporary VarToRegMap used during the resolution of critical edges. 
VarToRegMap sharedCriticalVarToRegMap; - PhasedVar<regMaskTP> availableIntRegs; - PhasedVar<regMaskTP> availableFloatRegs; - PhasedVar<regMaskTP> availableDoubleRegs; + PhasedVar<regMaskTP> availableIntRegs; + PhasedVar<regMaskTP> availableFloatRegs; + PhasedVar<regMaskTP> availableDoubleRegs; +#if defined(TARGET_XARCH) + PhasedVar<regMaskTP> availableMaskRegs; +#endif PhasedVar<regMaskTP>* availableRegs[TYP_COUNT]; // Register mask of argument registers currently occupied because we saw a @@ -1938,10 +1951,11 @@ class LinearScan : public LinearScanInterface #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); +#if defined(TARGET_XARCH) + #if defined(TARGET_AMD64) regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; - unsigned availableRegCount; regMaskTP get_RBM_ALLFLOAT() const { return this->rbmAllFloat; } regMaskTP get_RBM_FLT_CALLEE_TRASH() const { return this->rbmFltCalleeTrash; } +#endif // TARGET_AMD64 + + unsigned availableRegCount; + unsigned get_AVAILABLE_REG_COUNT() const { return this->availableRegCount; } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH //------------------------------------------------------------------------ // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 14f142d7908faa..a6fbce9b40cb4a 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1393,6 +1393,16 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg return defRefPosition; } +#if defined(FEATURE_SIMD) && defined(TARGET_XARCH) +RefPosition* LinearScan::buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands) +{ + // The candidate set should contain only mask registers. + assert((internalCands & ~availableMaskRegs) == RBM_NONE); + + return defineNewInternalTemp(tree, MaskRegisterType, internalCands); +} +#endif + //------------------------------------------------------------------------ // buildInternalRegisterUses - adds use positions for internal // registers required for tree node.
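[Editor's illustration, not part of the patch: a minimal standalone C++ sketch of the candidate-subset invariant that buildInternalMaskRegisterDefForNode asserts above. The _DEMO names are hypothetical stand-ins for the JIT's regMaskTP values; the bit positions follow the amd64 KBASE/KMASK layout added to register.h below.]

#include <cassert>
#include <cstdint>

// On amd64 the patch places mask registers at KBASE = 48 (see register.h),
// so K1's mask corresponds to bit 49 of the 64-bit register mask.
constexpr uint64_t RBM_K1_DEMO      = 1ull << (48 + 1);
constexpr uint64_t RBM_ALLMASK_DEMO = RBM_K1_DEMO; // only K1 is allocatable for now

// Same shape as the JIT assert: any requested internal candidate set must be
// a subset of the available mask registers.
void checkMaskCandidates(uint64_t internalCands)
{
    assert((internalCands & ~RBM_ALLMASK_DEMO) == 0);
}

int main()
{
    checkMaskCandidates(RBM_K1_DEMO); // OK: K1 is within RBM_ALLMASK
    return 0;
}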
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index ba44a9fe65e4d8..c57e64981f69db 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2495,6 +2495,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } + case NI_AVX512F_MoveMaskSpecial: + { + srcCount += BuildOperandUses(op1); + buildInternalMaskRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + default: { assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index ca90673e85adfe..7f2a0d47570f96 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -69,9 +69,18 @@ REGALIAS(EDI, RDI) #ifdef TARGET_AMD64 #define XMMBASE 16 #define XMMMASK(x) ((__int64)(1) << ((x)+XMMBASE)) + +#define KBASE 48 +#define KMASK(x) ((__int64)(1) << ((x)+KBASE)) + #else // !TARGET_AMD64 #define XMMBASE 8 #define XMMMASK(x) ((__int32)(1) << ((x)+XMMBASE)) + +#define KBASE 16 +#define KMASK(x) ((__int32)(1) << ((x)+KBASE)) + + #endif // !TARGET_AMD64 REGDEF(XMM0, 0+XMMBASE, XMMMASK(0), "mm0" ) @@ -83,9 +92,7 @@ REGDEF(XMM5, 5+XMMBASE, XMMMASK(5), "mm5" ) REGDEF(XMM6, 6+XMMBASE, XMMMASK(6), "mm6" ) REGDEF(XMM7, 7+XMMBASE, XMMMASK(7), "mm7" ) -#ifdef TARGET_X86 -REGDEF(STK, 8+XMMBASE, 0x0000, "STK" ) -#else // !TARGET_X86 +#ifdef TARGET_AMD64 REGDEF(XMM8, 8+XMMBASE, XMMMASK(8), "mm8" ) REGDEF(XMM9, 9+XMMBASE, XMMMASK(9), "mm9" ) REGDEF(XMM10, 10+XMMBASE, XMMMASK(10), "mm10" ) @@ -113,9 +120,18 @@ REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) -REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) +#endif // !TARGET_AMD64 + +REGDEF(K0, 0+KBASE, KMASK(0), "k0" ) +REGDEF(K1, 1+KBASE, KMASK(1), "k1" ) +REGDEF(K2, 2+KBASE, KMASK(2), "k2" ) +REGDEF(K3, 3+KBASE, KMASK(3), "k3" ) +REGDEF(K4, 4+KBASE, KMASK(4), "k4" ) +REGDEF(K5, 5+KBASE, KMASK(5), "k5" ) +REGDEF(K6, 6+KBASE, KMASK(6), "k6" ) +REGDEF(K7, 7+KBASE, KMASK(7), "k7" ) -#endif // !TARGET_X86 +REGDEF(STK, 8+KBASE, 0x0000, "STK" ) #elif defined(TARGET_ARM) #include "registerarm.h" diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 8baf645453adf5..22e3c106a4ae24 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -190,7 +190,7 @@ enum _regMask_enum : unsigned #error Unsupported target architecture #endif -#if defined(TARGET_AMD64) +#if defined(TARGET_XARCH) // AVAILABLE_REG_COUNT is defined to be dynamic, based on whether AVX-512 high registers are available. #define AVAILABLE_REG_COUNT get_AVAILABLE_REG_COUNT() #else @@ -682,6 +682,11 @@ inline bool isFloatRegType(var_types type) return varTypeUsesFloatReg(type); } +inline bool isMaskReg(var_types type) +{ + return varTypeIsMask(type); +} + // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. 
#if defined(WINDOWS_AMD64_ABI) #if !defined(TARGET_AMD64) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index ac3f0ca7e8c027..6c56df1106923a 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -91,6 +91,13 @@ #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 + #define REG_MASK_FIRST REG_K0 + #define REG_MASK_LAST REG_K7 + + #define RBM_ALLMASK RBM_K1 + + #define CNT_MASK_REGS 8 + #ifdef UNIX_AMD64_ABI #define LAST_FP_ARGREG REG_XMM7 #else // !UNIX_AMD64_ABI diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index dffd6adf2efb08..0499abe7d8149b 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -74,6 +74,11 @@ #define REG_FP_FIRST REG_XMM0 #define REG_FP_LAST REG_XMM7 + + #define REG_MASK_FIRST REG_K0 + #define REG_MASK_LAST REG_K7 + #define CNT_MASK_REGS 8 + #define FIRST_FP_ARGREG REG_XMM0 #define LAST_FP_ARGREG REG_XMM3 #define REG_FLTARG_0 REG_XMM0 @@ -91,6 +96,10 @@ #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) #define RBM_ALLDOUBLE RBM_ALLFLOAT + #define RBM_ALLMASK RBM_K1 + + #define CNT_HIGHFLOAT 0 + // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers. // This should be reconsidered. #define RBM_FLT_CALLEE_SAVED RBM_NONE diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 6cb1cb0e466539..cff413de248199 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -60,6 +60,7 @@ DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S|VTF_ #if defined(TARGET_XARCH) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S|VTF_VEC) DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S|VTF_VEC) +DEF_TP(MASK ,"mask" , TYP_MASK, TI_STRUCT, 8, 8, 8, 2, 8, VTF_ANY) #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -72,4 +73,4 @@ DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) #undef PS #undef PST #undef VTF_I32 -#undef VTF_I64 +#undef VTF_I64 \ No newline at end of file diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index 3c05a169933176..f5c8d42f2b204f 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -70,7 +70,17 @@ inline bool varTypeIsSIMD(T vt) #else // Always return false if FEATURE_SIMD is not enabled return false; -#endif // !FEATURE_SIMD +#endif +} + +template <class T> +inline bool varTypeIsMask(T vt) +{ +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + return (TypeGet(vt) == TYP_MASK); +#else // FEATURE_SIMD + return false; +#endif } template <class T>
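[Editor's illustration, not part of the patch: what the new lowering produces for 32-bit elements, sketched with C++ compiler intrinsics rather than the managed Vector512 API. Assumes an AVX512F+AVX512DQ-capable compiler and CPU; the function and variable names are illustrative.]

#include <immintrin.h>
#include <cstdint>

// Mirrors the maskIns/kmovIns pairing chosen in genAvxFamilyIntrinsic above for
// TYP_INT/TYP_UINT/TYP_FLOAT: vpmovd2m collects the 16 per-lane sign bits into
// a k-register, then kmovw moves the mask into a general-purpose register.
uint32_t ExtractMsb32x16(__m512i v)
{
    __mmask16 m = _mm512_movepi32_mask(v); // vpmovd2m k, zmm (AVX512DQ)
    return _cvtmask16_u32(m);              // kmovw r32, k    (AVX512F)
}

For byte elements the same pattern widens to vpmovb2m + kmovq (a 64-bit mask), matching the baseType switch in the codegen.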