From 8273939694f9c36bcc8546f86236064ba1ff5073 Mon Sep 17 00:00:00 2001 From: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com> Date: Sat, 6 Nov 2021 06:10:03 +0300 Subject: [PATCH] Implement `GT_AND_NOT` for ARM/ARM64 (#59881) * Move late arithmetic to its own section in gtlist * Implement AND_NOT for ARMArch * Delete the unnecessary platform-specific methods They were a leftover from some previous work. * Mention the SIMD origins of AND_NOT --- src/coreclr/jit/codegenarm.cpp | 7 ++-- src/coreclr/jit/codegenarm64.cpp | 11 +++++-- src/coreclr/jit/codegenarmarch.cpp | 1 + src/coreclr/jit/emitarm.cpp | 2 +- src/coreclr/jit/gtlist.h | 45 ++++++++++++++------------ src/coreclr/jit/lower.cpp | 52 ++++++++++++++++++++++++++++-- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lsraarm.cpp | 1 + src/coreclr/jit/lsraarm64.cpp | 1 + 9 files changed, 93 insertions(+), 28 deletions(-) diff --git a/src/coreclr/jit/codegenarm.cpp b/src/coreclr/jit/codegenarm.cpp index 23beb66352978..89c3d4f88328e 100644 --- a/src/coreclr/jit/codegenarm.cpp +++ b/src/coreclr/jit/codegenarm.cpp @@ -313,8 +313,8 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) var_types targetType = treeNode->TypeGet(); emitter* emit = GetEmitter(); - assert(oper == GT_ADD || oper == GT_SUB || oper == GT_MUL || oper == GT_ADD_LO || oper == GT_ADD_HI || - oper == GT_SUB_LO || oper == GT_SUB_HI || oper == GT_OR || oper == GT_XOR || oper == GT_AND); + assert(treeNode->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_ADD_LO, GT_ADD_HI, GT_SUB_LO, GT_SUB_HI, GT_OR, GT_XOR, GT_AND, + GT_AND_NOT)); GenTree* op1 = treeNode->gtGetOp1(); GenTree* op2 = treeNode->gtGetOp2(); @@ -671,6 +671,9 @@ instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) case GT_AND: ins = INS_AND; break; + case GT_AND_NOT: + ins = INS_bic; + break; case GT_MUL: ins = INS_MUL; break; diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 8f4db06485bb9..aa609801ac645 100644 --- 
a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -1817,7 +1817,7 @@ void CodeGen::genCodeForMulHi(GenTreeOp* treeNode) genProduceReg(treeNode); } -// Generate code for ADD, SUB, MUL, DIV, UDIV, AND, OR and XOR +// Generate code for ADD, SUB, MUL, DIV, UDIV, AND, AND_NOT, OR and XOR // This method is expected to have called genConsumeOperands() before calling it. void CodeGen::genCodeForBinary(GenTreeOp* treeNode) { @@ -1826,8 +1826,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) var_types targetType = treeNode->TypeGet(); emitter* emit = GetEmitter(); - assert(oper == GT_ADD || oper == GT_SUB || oper == GT_MUL || oper == GT_DIV || oper == GT_UDIV || oper == GT_AND || - oper == GT_OR || oper == GT_XOR); + assert(treeNode->OperIs(GT_ADD, GT_SUB, GT_MUL, GT_DIV, GT_UDIV, GT_AND, GT_AND_NOT, GT_OR, GT_XOR)); GenTree* op1 = treeNode->gtGetOp1(); GenTree* op2 = treeNode->gtGetOp2(); @@ -1846,6 +1845,9 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) case GT_AND: ins = INS_ands; break; + case GT_AND_NOT: + ins = INS_bics; + break; default: noway_assert(!"Unexpected BinaryOp with GTF_SET_FLAGS set"); } @@ -3119,6 +3121,9 @@ instruction CodeGen::genGetInsForOper(genTreeOps oper, var_types type) case GT_AND: ins = INS_and; break; + case GT_AND_NOT: + ins = INS_bic; + break; case GT_DIV: ins = INS_sdiv; break; diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 4f6f7890f50d9..8ae5ebd143725 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -213,6 +213,7 @@ void CodeGen::genCodeForTreeNode(GenTree* treeNode) case GT_OR: case GT_XOR: case GT_AND: + case GT_AND_NOT: assert(varTypeIsIntegralOrI(treeNode)); FALLTHROUGH; diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp index 7ed8421074192..1a955ce88211f 100644 --- a/src/coreclr/jit/emitarm.cpp +++ b/src/coreclr/jit/emitarm.cpp @@ -8095,7 +8095,7 @@ regNumber 
emitter::emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, if (dst->gtSetFlags()) { assert((ins == INS_add) || (ins == INS_adc) || (ins == INS_sub) || (ins == INS_sbc) || (ins == INS_and) || - (ins == INS_orr) || (ins == INS_eor) || (ins == INS_orn)); + (ins == INS_orr) || (ins == INS_eor) || (ins == INS_orn) || (ins == INS_bic)); flags = INS_FLAGS_SET; } diff --git a/src/coreclr/jit/gtlist.h b/src/coreclr/jit/gtlist.h index f818fb6c7c93f..4ef5792a6909e 100644 --- a/src/coreclr/jit/gtlist.h +++ b/src/coreclr/jit/gtlist.h @@ -126,32 +126,12 @@ GTNODE(UMOD , GenTreeOp ,0,GTK_BINOP) GTNODE(OR , GenTreeOp ,1,(GTK_BINOP|GTK_LOGOP)) GTNODE(XOR , GenTreeOp ,1,(GTK_BINOP|GTK_LOGOP)) GTNODE(AND , GenTreeOp ,1,(GTK_BINOP|GTK_LOGOP)) -GTNODE(AND_NOT , GenTreeOp ,0,GTK_BINOP) GTNODE(LSH , GenTreeOp ,0,GTK_BINOP) GTNODE(RSH , GenTreeOp ,0,GTK_BINOP) GTNODE(RSZ , GenTreeOp ,0,GTK_BINOP) GTNODE(ROL , GenTreeOp ,0,GTK_BINOP) GTNODE(ROR , GenTreeOp ,0,GTK_BINOP) -GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod) - -// Returns high bits (top N bits of the 2N bit result of an NxN multiply) -// GT_MULHI is used in division by a constant (LowerUnsignedDivOrMod). We turn -// the div into a MULHI + some adjustments. In codegen, we only use the -// results of the high register, and we drop the low results. -GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) - -// A mul that returns the 2N bit result of an NxN multiply. This op is used for -// multiplies that take two ints and return a long result. For 32 bit targets, -// all other multiplies with long results are morphed into helper calls. -// It is similar to GT_MULHI, the difference being that GT_MULHI drops the lo -// part of the result, whereas GT_MUL_LONG keeps both parts of the result. -// MUL_LONG is also used on ARM64, where 64 bit multiplication is more expensive. 
-#if !defined(TARGET_64BIT) -GTNODE(MUL_LONG , GenTreeMultiRegOp ,1,GTK_BINOP) -#elif defined(TARGET_ARM64) -GTNODE(MUL_LONG , GenTreeOp ,1,GTK_BINOP) -#endif GTNODE(ASG , GenTreeOp ,0,(GTK_BINOP|GTK_NOTLIR)) GTNODE(EQ , GenTreeOp ,0,(GTK_BINOP|GTK_RELOP)) @@ -220,6 +200,31 @@ GTNODE(SIMD , GenTreeSIMD ,0,(GTK_BINOP|GTK_EXOP)) // SIM GTNODE(HWINTRINSIC , GenTreeHWIntrinsic ,0,(GTK_BINOP|GTK_EXOP)) // hardware intrinsics #endif // FEATURE_HW_INTRINSICS +//----------------------------------------------------------------------------- +// Backend-specific arithmetic nodes: +//----------------------------------------------------------------------------- + +GTNODE(INC_SATURATE , GenTreeOp ,0,GTK_UNOP) // saturating increment, used in division by a constant (LowerUnsignedDivOrMod) + +// Returns high bits (top N bits of the 2N bit result of an NxN multiply) +// GT_MULHI is used in division by a constant (LowerUnsignedDivOrMod). We turn +// the div into a MULHI + some adjustments. In codegen, we only use the +// results of the high register, and we drop the low results. +GTNODE(MULHI , GenTreeOp ,1,GTK_BINOP) + +// A mul that returns the 2N bit result of an NxN multiply. This op is used for +// multiplies that take two ints and return a long result. For 32 bit targets, +// all other multiplies with long results are morphed into helper calls. +// It is similar to GT_MULHI, the difference being that GT_MULHI drops the lo +// part of the result, whereas GT_MUL_LONG keeps both parts of the result. +// MUL_LONG is also used on ARM64, where 64 bit multiplication is more expensive. +#if !defined(TARGET_64BIT) +GTNODE(MUL_LONG , GenTreeMultiRegOp ,1,GTK_BINOP) +#elif defined(TARGET_ARM64) +GTNODE(MUL_LONG , GenTreeOp ,1,GTK_BINOP) +#endif +// AndNot - emitted on ARM/ARM64 as the BIC instruction. Also used for creating AndNot HWINTRINSIC vector nodes in a cross-ISA manner. 
+GTNODE(AND_NOT , GenTreeOp ,0,GTK_BINOP) //----------------------------------------------------------------------------- // LIR specific compare and conditional branch/set nodes: //----------------------------------------------------------------------------- diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index ea76536df60b3..5bbeaef2a9ff4 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -139,8 +139,7 @@ GenTree* Lowering::LowerNode(GenTree* node) case GT_AND: case GT_OR: case GT_XOR: - ContainCheckBinary(node->AsOp()); - break; + return LowerBinaryArithmetic(node->AsOp()); case GT_MUL: case GT_MULHI: @@ -5104,6 +5103,55 @@ GenTree* Lowering::LowerAdd(GenTreeOp* node) return nullptr; } +//------------------------------------------------------------------------ +// LowerBinaryArithmetic: lowers the given binary arithmetic node. +// +// Recognizes opportunities for using target-independent "combined" nodes +// (currently AND_NOT on ARMArch). Performs containment checks. +// +// Arguments: +// node - the arithmetic node to lower +// +// Returns: +// The next node to lower. +// +GenTree* Lowering::LowerBinaryArithmetic(GenTreeOp* node) +{ + // TODO-CQ-XArch: support BMI2 "andn" in codegen and condition + // this logic on the support for the instruction set on XArch. 
+ CLANG_FORMAT_COMMENT_ANCHOR; + +#ifdef TARGET_ARMARCH + if (comp->opts.OptimizationEnabled() && node->OperIs(GT_AND)) + { + GenTree* opNode = nullptr; + GenTree* notNode = nullptr; + if (node->gtGetOp1()->OperIs(GT_NOT)) + { + notNode = node->gtGetOp1(); + opNode = node->gtGetOp2(); + } + else if (node->gtGetOp2()->OperIs(GT_NOT)) + { + notNode = node->gtGetOp2(); + opNode = node->gtGetOp1(); + } + + if (notNode != nullptr) + { + node->gtOp1 = opNode; + node->gtOp2 = notNode->AsUnOp()->gtGetOp1(); + node->ChangeOper(GT_AND_NOT); + BlockRange().Remove(notNode); + } + } +#endif // TARGET_ARMARCH + + ContainCheckBinary(node); + + return node->gtNext; +} + //------------------------------------------------------------------------ // LowerUnsignedDivOrMod: Lowers a GT_UDIV/GT_UMOD node. // diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index b64f4a6410944..9310f8182bad8 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -297,6 +297,7 @@ class Lowering final : public Phase void LowerStoreIndir(GenTreeStoreInd* node); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); + GenTree* LowerBinaryArithmetic(GenTreeOp* node); bool LowerUnsignedDivOrMod(GenTreeOp* divMod); GenTree* LowerConstIntDivOrMod(GenTree* node); GenTree* LowerSignedDivOrMod(GenTree* node); diff --git a/src/coreclr/jit/lsraarm.cpp b/src/coreclr/jit/lsraarm.cpp index b4c61c78c222f..0cea6e1764e6a 100644 --- a/src/coreclr/jit/lsraarm.cpp +++ b/src/coreclr/jit/lsraarm.cpp @@ -368,6 +368,7 @@ int LinearScan::BuildNode(GenTree* tree) FALLTHROUGH; case GT_AND: + case GT_AND_NOT: case GT_OR: case GT_XOR: case GT_LSH: diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index d59c8e913b02e..0b52915af1ab4 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -268,6 +268,7 @@ int LinearScan::BuildNode(GenTree* tree) FALLTHROUGH; case GT_AND: + case GT_AND_NOT: case GT_OR: case GT_XOR: case GT_LSH: