Resolve formatting changes.
This commit is contained in:
parent
7b2e701906
commit
ecbed4e0ab
|
|
@ -0,0 +1,478 @@
|
||||||
|
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
|
||||||
|
index 3357b0e1a6e..1acbaf080a5 100644
|
||||||
|
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
|
||||||
|
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
|
||||||
|
@@ -286,6 +286,18 @@ m_GAShr(const LHS &L, const RHS &R) {
|
||||||
|
return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
|
||||||
|
}
|
||||||
|
|
||||||
|
+template <typename LHS, typename RHS>
|
||||||
|
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
|
||||||
|
+m_GSMax(const LHS &L, const RHS &R) {
|
||||||
|
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+template <typename LHS, typename RHS>
|
||||||
|
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
|
||||||
|
+m_GSMin(const LHS &L, const RHS &R) {
|
||||||
|
+ return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
// Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
|
||||||
|
template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
|
||||||
|
SrcTy L;
|
||||||
|
@@ -440,6 +452,7 @@ struct TernaryOp_match {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
||||||
|
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
|
||||||
|
TargetOpcode::G_INSERT_VECTOR_ELT>
|
||||||
|
@@ -448,6 +461,13 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
||||||
|
TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
||||||
|
+inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
|
||||||
|
+m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
||||||
|
+ return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
|
||||||
|
+ Src0, Src1, Src2);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/// Matches a register negated by a G_SUB.
|
||||||
|
/// G_SUB 0, %negated_reg
|
||||||
|
template <typename SrcTy>
|
||||||
|
@@ -464,7 +484,8 @@ m_Not(const SrcTy &&Src) {
|
||||||
|
return m_GXor(Src, m_AllOnesInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
-} // namespace GMIPatternMatch
|
||||||
|
+
|
||||||
|
+} // namespace MIPatternMatch
|
||||||
|
} // namespace llvm
|
||||||
|
|
||||||
|
#endif
|
||||||
|
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
|
||||||
|
index a8399176bb4..b6a6fb3e77d 100644
|
||||||
|
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
|
||||||
|
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
|
||||||
|
@@ -37,13 +37,21 @@ def cvt_f32_ubyteN : GICombineRule<
|
||||||
|
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
|
||||||
|
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
|
||||||
|
|
||||||
|
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
|
||||||
|
+
|
||||||
|
+def clamp_i64_to_i16 : GICombineRule<
|
||||||
|
+ (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
|
||||||
|
+ (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
|
||||||
|
+ [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
|
||||||
|
+ (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
|
||||||
|
+
|
||||||
|
// Combines which should only apply on SI/VI
|
||||||
|
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
|
||||||
|
|
||||||
|
-
|
||||||
|
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
||||||
|
- "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
|
||||||
|
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
|
||||||
|
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
|
||||||
|
+ let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
|
||||||
|
}
|
||||||
|
|
||||||
|
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
|
||||||
|
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
|
||||||
|
index 3fb5eec9f77..d349bade1e6 100644
|
||||||
|
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
|
||||||
|
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
|
||||||
|
@@ -11,8 +11,9 @@
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
-#include "AMDGPUTargetMachine.h"
|
||||||
|
#include "AMDGPULegalizerInfo.h"
|
||||||
|
+#include "AMDGPUTargetMachine.h"
|
||||||
|
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/Combiner.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
|
||||||
|
@@ -22,7 +23,6 @@
|
||||||
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||||
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
+
|
||||||
|
class AMDGPUPostLegalizerCombinerHelperState {
|
||||||
|
protected:
|
||||||
|
CombinerHelper &Helper;
|
||||||
|
@@ -331,6 +332,7 @@ public:
|
||||||
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||||
|
|
||||||
|
void getAnalysisUsage(AnalysisUsage &AU) const override;
|
||||||
|
+
|
||||||
|
private:
|
||||||
|
bool IsOptNone;
|
||||||
|
};
|
||||||
|
@@ -350,7 +352,7 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
|
||||||
|
}
|
||||||
|
|
||||||
|
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
|
||||||
|
- : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
|
||||||
|
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
|
||||||
|
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -364,8 +366,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
|
||||||
|
|
||||||
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||||
|
- const AMDGPULegalizerInfo *LI
|
||||||
|
- = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
|
||||||
|
+ const AMDGPULegalizerInfo *LI =
|
||||||
|
+ static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
|
||||||
|
|
||||||
|
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
|
||||||
|
MachineDominatorTree *MDT =
|
||||||
|
@@ -378,8 +380,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
|
||||||
|
char AMDGPUPostLegalizerCombiner::ID = 0;
|
||||||
|
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
|
||||||
|
- "Combine AMDGPU machine instrs after legalization",
|
||||||
|
- false, false)
|
||||||
|
+ "Combine AMDGPU machine instrs after legalization", false,
|
||||||
|
+ false)
|
||||||
|
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
|
||||||
|
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
|
||||||
|
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
|
||||||
|
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
|
||||||
|
index aea148e9107..7fca3c52bba 100644
|
||||||
|
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
|
||||||
|
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
|
||||||
|
@@ -11,6 +11,7 @@
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
+#include "AMDGPULegalizerInfo.h"
|
||||||
|
#include "AMDGPUTargetMachine.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/Combiner.h"
|
||||||
|
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
|
||||||
|
@@ -28,6 +29,154 @@
|
||||||
|
using namespace llvm;
|
||||||
|
using namespace MIPatternMatch;
|
||||||
|
|
||||||
|
+class AMDGPUPreLegalizerCombinerHelper {
|
||||||
|
+protected:
|
||||||
|
+ MachineIRBuilder &B;
|
||||||
|
+ MachineFunction &MF;
|
||||||
|
+ MachineRegisterInfo &MRI;
|
||||||
|
+ CombinerHelper &Helper;
|
||||||
|
+
|
||||||
|
+public:
|
||||||
|
+ AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
|
||||||
|
+ : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
|
||||||
|
+
|
||||||
|
+ struct ClampI64ToI16MatchInfo {
|
||||||
|
+ int64_t Cmp1;
|
||||||
|
+ int64_t Cmp2;
|
||||||
|
+ Register Origin;
|
||||||
|
+ };
|
||||||
|
+
|
||||||
|
+ bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||||
|
+ MachineFunction &MF,
|
||||||
|
+ ClampI64ToI16MatchInfo &MatchInfo);
|
||||||
|
+
|
||||||
|
+ void applyClampI64ToI16(MachineInstr &MI,
|
||||||
|
+ const ClampI64ToI16MatchInfo &MatchInfo);
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
|
||||||
|
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
|
||||||
|
+ ClampI64ToI16MatchInfo &MatchInfo) {
|
||||||
|
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
|
||||||
|
+
|
||||||
|
+ // Try to find a pattern where an i64 value should get clamped to short.
|
||||||
|
+ const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
|
||||||
|
+ if (SrcType != LLT::scalar(64))
|
||||||
|
+ return false;
|
||||||
|
+
|
||||||
|
+ const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
|
||||||
|
+ if (DstType != LLT::scalar(16))
|
||||||
|
+ return false;
|
||||||
|
+
|
||||||
|
+ LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
|
||||||
|
+
|
||||||
|
+ Register Base;
|
||||||
|
+
|
||||||
|
+ // Try to match a combination of min / max MIR opcodes.
|
||||||
|
+ if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
|
||||||
|
+ if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
|
||||||
|
+ return false;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
|
||||||
|
+ if (!mi_match(Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
|
||||||
|
+ return false;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ const auto Cmp1 = MatchInfo.Cmp1;
|
||||||
|
+ const auto Cmp2 = MatchInfo.Cmp2;
|
||||||
|
+ const auto Diff = std::abs(Cmp2 - Cmp1);
|
||||||
|
+
|
||||||
|
+ // If the difference between both comparison values is 0 or 1, there is no need to clamp.
|
||||||
|
+ if (Diff == 0 || Diff == 1)
|
||||||
|
+ return false;
|
||||||
|
+
|
||||||
|
+ const int64_t Min = std::numeric_limits<int16_t>::min();
|
||||||
|
+ const int64_t Max = std::numeric_limits<int16_t>::max();
|
||||||
|
+
|
||||||
|
+ // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
|
||||||
|
+ return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
|
||||||
|
+ (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+// We want to find a combination of instructions that
|
||||||
|
+// gets generated when an i64 gets clamped to i16.
|
||||||
|
+// The corresponding pattern is:
|
||||||
|
+// G_SMAX / G_SMIN for i16 <= G_TRUNC i64.
|
||||||
|
+// This can be efficiently written as follows:
|
||||||
|
+// v_cvt_pk_i16_i32 v0, v0, v1
|
||||||
|
+// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
|
||||||
|
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
|
||||||
|
+ MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
|
||||||
|
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
|
||||||
|
+
|
||||||
|
+ Register Src = MatchInfo.Origin;
|
||||||
|
+ assert(MRI.getType(Src) == LLT::scalar(64));
|
||||||
|
+ const LLT S32 = LLT::scalar(32);
|
||||||
|
+
|
||||||
|
+ B.setMBB(*MI.getParent());
|
||||||
|
+ B.setInstrAndDebugLoc(MI);
|
||||||
|
+
|
||||||
|
+ auto Unmerge = B.buildUnmerge(S32, Src);
|
||||||
|
+ Register Hi32 = Unmerge.getReg(0);
|
||||||
|
+ Register Lo32 = Unmerge.getReg(1);
|
||||||
|
+ MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
|
||||||
|
+ MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
|
||||||
|
+
|
||||||
|
+ constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
|
||||||
|
+ assert(MI.getOpcode() != CvtOpcode);
|
||||||
|
+
|
||||||
|
+ const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
|
||||||
|
+
|
||||||
|
+ Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
|
||||||
|
+ MRI.setType(CvtDst, S32);
|
||||||
|
+
|
||||||
|
+ auto CvtPk = B.buildInstr(CvtOpcode);
|
||||||
|
+ CvtPk.addDef(CvtDst);
|
||||||
|
+ CvtPk.addReg(Hi32);
|
||||||
|
+ CvtPk.addReg(Lo32);
|
||||||
|
+ CvtPk.setMIFlags(MI.getFlags());
|
||||||
|
+
|
||||||
|
+ auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
|
||||||
|
+ auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
|
||||||
|
+
|
||||||
|
+ auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
|
||||||
|
+ MRI.setRegClass(MinBoundaryDst.getReg(0), REG_CLASS);
|
||||||
|
+
|
||||||
|
+ auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
|
||||||
|
+ MRI.setRegClass(MaxBoundaryDst.getReg(0), REG_CLASS);
|
||||||
|
+
|
||||||
|
+ Register MedDst = MRI.createVirtualRegister(REG_CLASS);
|
||||||
|
+ MRI.setType(MedDst, S32);
|
||||||
|
+
|
||||||
|
+ auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
|
||||||
|
+ Med.addDef(MedDst);
|
||||||
|
+ Med.addReg(MinBoundaryDst.getReg(0));
|
||||||
|
+ Med.addReg(CvtDst);
|
||||||
|
+ Med.addReg(MaxBoundaryDst.getReg(0));
|
||||||
|
+ Med.setMIFlags(MI.getFlags());
|
||||||
|
+
|
||||||
|
+ Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
|
||||||
|
+ B.buildTrunc(TruncDst, MedDst);
|
||||||
|
+ B.buildCopy(MI.getOperand(0).getReg(), TruncDst);
|
||||||
|
+
|
||||||
|
+ MI.eraseFromParent();
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+class AMDGPUPreLegalizerCombinerHelperState {
|
||||||
|
+protected:
|
||||||
|
+ CombinerHelper &Helper;
|
||||||
|
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
|
||||||
|
+
|
||||||
|
+public:
|
||||||
|
+ AMDGPUPreLegalizerCombinerHelperState(
|
||||||
|
+ CombinerHelper &Helper,
|
||||||
|
+ AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
|
||||||
|
+ : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
||||||
|
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
@@ -45,9 +194,10 @@ public:
|
||||||
|
AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
|
||||||
|
|
||||||
|
AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
|
||||||
|
+ const AMDGPULegalizerInfo *LI,
|
||||||
|
GISelKnownBits *KB, MachineDominatorTree *MDT)
|
||||||
|
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
|
||||||
|
- /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
|
||||||
|
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
|
||||||
|
KB(KB), MDT(MDT) {
|
||||||
|
if (!GeneratedRuleCfg.parseCommandLineOption())
|
||||||
|
report_fatal_error("Invalid rule identifier");
|
||||||
|
@@ -61,7 +211,9 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
|
||||||
|
MachineInstr &MI,
|
||||||
|
MachineIRBuilder &B) const {
|
||||||
|
CombinerHelper Helper(Observer, B, KB, MDT);
|
||||||
|
- AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
|
||||||
|
+ AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
|
||||||
|
+ AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
|
||||||
|
+ PreLegalizerHelper);
|
||||||
|
|
||||||
|
if (Generated.tryCombineAll(Observer, MI, B, Helper))
|
||||||
|
return true;
|
||||||
|
@@ -127,11 +279,16 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
const Function &F = MF.getFunction();
|
||||||
|
bool EnableOpt =
|
||||||
|
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
|
||||||
|
+
|
||||||
|
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||||
|
+ const AMDGPULegalizerInfo *LI =
|
||||||
|
+ static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
|
||||||
|
+
|
||||||
|
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
|
||||||
|
MachineDominatorTree *MDT =
|
||||||
|
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
|
||||||
|
AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
|
||||||
|
- F.hasMinSize(), KB, MDT);
|
||||||
|
+ F.hasMinSize(), LI, KB, MDT);
|
||||||
|
Combiner C(PCInfo, TPC);
|
||||||
|
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
|
||||||
|
}
|
||||||
|
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
|
||||||
|
new file mode 100644
|
||||||
|
index 00000000000..e7d6634c29a
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
|
||||||
|
@@ -0,0 +1,112 @@
|
||||||
|
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
|
||||||
|
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
|
||||||
|
+; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
|
||||||
|
+
|
||||||
|
+declare i64 @llvm.smax.i64(i64, i64)
|
||||||
|
+declare i64 @llvm.smin.i64(i64, i64)
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16
|
||||||
|
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
|
||||||
|
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
|
||||||
|
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
|
||||||
|
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
|
||||||
|
+define i16 @v_clamp_i64_i16(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
|
||||||
|
+ %result = trunc i64 %min to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
|
||||||
|
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
|
||||||
|
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
|
||||||
|
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
|
||||||
|
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
|
||||||
|
+define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
|
||||||
|
+ %result = trunc i64 %max to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
|
||||||
|
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||||
|
+; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
|
||||||
|
+
|
||||||
|
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
|
||||||
|
+; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
|
||||||
|
+define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
|
||||||
|
+ %result = trunc i64 %max to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
|
||||||
|
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||||
|
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
|
||||||
|
+define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
|
||||||
|
+ %result = trunc i64 %min to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
|
||||||
|
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B]], 0x100
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
|
||||||
|
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
|
||||||
|
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
|
||||||
|
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
|
||||||
|
+define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %in, i64 256)
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
|
||||||
|
+ %result = trunc i64 %max to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
|
||||||
|
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[B]], 0x100
|
||||||
|
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
|
||||||
|
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
|
||||||
|
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
|
||||||
|
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
|
||||||
|
+define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %max, i64 256)
|
||||||
|
+ %result = trunc i64 %min to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_zero
|
||||||
|
+; GFX6789: v_mov_b32_e32 v0, 0
|
||||||
|
+; GFX10: v_mov_b32_e32 v0, 0
|
||||||
|
+define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
|
||||||
|
+entry:
|
||||||
|
+ %max = call i64 @llvm.smax.i64(i64 %in, i64 0)
|
||||||
|
+ %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
|
||||||
|
+ %result = trunc i64 %min to i16
|
||||||
|
+ ret i16 %result
|
||||||
|
+}
|
||||||
|
\ No newline at end of file
|
||||||
|
|
@ -472,7 +472,6 @@ struct TernaryOp_match {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
||||||
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
|
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
|
||||||
TargetOpcode::G_INSERT_VECTOR_ELT>
|
TargetOpcode::G_INSERT_VECTOR_ELT>
|
||||||
|
|
|
||||||
|
|
@ -197,11 +197,11 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
|
||||||
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
|
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
|
||||||
|
|
||||||
if (Ty == S32) {
|
if (Ty == S32) {
|
||||||
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
|
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
|
||||||
MI.getFlags());
|
{SrcReg}, MI.getFlags());
|
||||||
} else {
|
} else {
|
||||||
auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
|
auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
|
||||||
MI.getFlags());
|
{SrcReg}, MI.getFlags());
|
||||||
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
|
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -254,7 +254,6 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
|
||||||
MI.eraseFromParent();
|
MI.eraseFromParent();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class AMDGPUPostLegalizerCombinerHelperState {
|
class AMDGPUPostLegalizerCombinerHelperState {
|
||||||
protected:
|
protected:
|
||||||
CombinerHelper &Helper;
|
CombinerHelper &Helper;
|
||||||
|
|
|
||||||
|
|
@ -91,7 +91,8 @@ bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
|
||||||
const auto Cmp2 = MatchInfo.Cmp2;
|
const auto Cmp2 = MatchInfo.Cmp2;
|
||||||
const auto Diff = std::abs(Cmp2 - Cmp1);
|
const auto Diff = std::abs(Cmp2 - Cmp1);
|
||||||
|
|
||||||
// If the difference between both comparison values is 0 or 1, there is no need to clamp.
|
// If the difference between both comparison values is 0 or 1, there is no
|
||||||
|
// need to clamp.
|
||||||
if (Diff == 0 || Diff == 1)
|
if (Diff == 0 || Diff == 1)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue