From b8652fbcbbde830ec2f48ff165e5706fb607d02b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 1 Apr 2022 16:58:45 +0100
Subject: [PATCH] [X86] Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y)) (RECOMMITTED)

As noticed on PR39174, if we're extracting a single non-constant bit index,
try to use BT+SETCC instead, to avoid moving the shift amount into the ECX
register and using slow x86 variable shift ops.

Recommitted with a fix to ensure we zext/trunc the SETCC result to the
original type.

Differential Revision: https://reviews.llvm.org/D122891
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++++++
 llvm/test/CodeGen/X86/setcc.ll          | 56 ++++++++++++++++---------
 2 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f78a010ea40b..8ad7d9cf49b4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47329,6 +47329,19 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
+  // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
+  // avoids slow variable shift (moving shift amount to ECX etc.)
+  if (isOneConstant(N1) && N0->hasOneUse()) {
+    SDValue Src = N0;
+    while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
+            Src.getOpcode() == ISD::TRUNCATE) &&
+           Src.getOperand(0)->hasOneUse())
+      Src = Src.getOperand(0);
+    if (Src.getOpcode() == ISD::SRL && !isa<ConstantSDNode>(Src.getOperand(1)))
+      if (SDValue BT = getBT(Src.getOperand(0), Src.getOperand(1), dl, DAG))
+        return DAG.getZExtOrTrunc(getSETCC(X86::COND_B, BT, dl, DAG), dl, VT);
+  }
+
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
     // Attempt to recursively combine a bitmask AND with shuffles.
     SDValue Op(N, 0);
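As a sketch of what this combine targets at the source level (the function and
parameter names here are illustrative, not from the patch): extracting one bit
of X at a non-constant index Y. Before this fold the backend emitted a variable
shift, which pins the shift amount in CL/ECX; afterwards it can select BT plus
SETB, as the updated setcc.ll checks below verify.

    // Illustrative C++ only: a single-bit extract at a runtime index.
    // This becomes AND(SRL(X,Y),1) in the SelectionDAG, the exact pattern
    // the new combineAnd() code rewrites to SETCC(BT(X,Y), COND_B).
    unsigned bit_is_set(unsigned x, unsigned y) {
      return (x >> y) & 1u; // y is not a compile-time constant
    }
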
diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll
index 57431887f58c..eede7310af87 100644
--- a/llvm/test/CodeGen/X86/setcc.ll
+++ b/llvm/test/CodeGen/X86/setcc.ll
@@ -139,19 +139,17 @@ define zeroext i1 @t6(i32 %a) #0 {
 define zeroext i1 @t7(i32 %0) {
 ; X86-LABEL: t7:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb $19, %al
-; X86-NEXT:    shrb %cl, %al
-; X86-NEXT:    andb $1, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $19, %ecx
+; X86-NEXT:    btl %eax, %ecx
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t7:
 ; X64:       ## %bb.0:
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    movb $19, %al
-; X64-NEXT:    ## kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrb %cl, %al
-; X64-NEXT:    andb $1, %al
+; X64-NEXT:    movl $19, %eax
+; X64-NEXT:    btl %edi, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %2 = trunc i32 %0 to i5
   %3 = lshr i5 -13, %2
@@ -163,20 +161,16 @@ define zeroext i1 @t7(i32 %0) {
 define zeroext i1 @t8(i8 %0, i8 %1) {
 ; X86-LABEL: t8:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    shrb %cl, %al
-; X86-NEXT:    andb $1, %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    btl %eax, %ecx
+; X86-NEXT:    setb %al
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t8:
 ; X64:       ## %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    ## kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrb %cl, %al
-; X64-NEXT:    andb $1, %al
-; X64-NEXT:    ## kill: def $al killed $al killed $eax
+; X64-NEXT:    btl %esi, %edi
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %3 = lshr i8 %0, %1
   %4 = and i8 %3, 1
@@ -184,6 +178,30 @@ define zeroext i1 @t8(i8 %0, i8 %1) {
   ret i1 %5
 }
 
+define i64 @t9(i32 %0, i32 %1) {
+; X86-LABEL: t9:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    btl %edx, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: t9:
+; X64:       ## %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    btl %esi, %edi
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
+  %3 = lshr i32 %0, %1
+  %4 = and i32 %3, 1
+  %5 = icmp ne i32 %4, 0
+  %6 = zext i1 %5 to i64
+  ret i64 %6
+}
+
 define i16 @shift_and(i16 %a) {
 ; X86-LABEL: shift_and:
 ; X86:       ## %bb.0:
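The new t9 test covers the reason for the recommit: there the i1 compare
result is zero-extended to i64, so the combine must zext/trunc the SETCC
output (an i8 on x86) back to the AND node's original type so the combined
node has the type the rest of the DAG expects. A rough C++ analogue of t9,
with an illustrative function name:

    // Illustrative C++ only: the extracted bit is widened to 64 bits,
    // matching t9's lshr/and/icmp/zext IR. With the fix, the setb result
    // is zero-extended to the full register width on return.
    unsigned long long bit_to_u64(unsigned x, unsigned y) {
      return (x >> y) & 1u; // implicitly zero-extended to 64 bits on return
    }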