307 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			307 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
| //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
 | |
| //
 | |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | |
| // See https://llvm.org/LICENSE.txt for license information.
 | |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | |
| //
 | |
| //===----------------------------------------------------------------------===//
 | |
| 
 | |
| #include "CSPreInliner.h"
 | |
| #include "ProfiledBinary.h"
 | |
| #include "llvm/ADT/SCCIterator.h"
 | |
| #include "llvm/ADT/Statistic.h"
 | |
| #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
 | |
| #include <cstdint>
 | |
| #include <queue>
 | |
| 
 | |
| #define DEBUG_TYPE "cs-preinliner"
 | |
| 
 | |
| using namespace llvm;
 | |
| using namespace sampleprof;
 | |
| 
 | |
| STATISTIC(PreInlNumCSInlined,
 | |
|           "Number of functions inlined with context sensitive profile");
 | |
| STATISTIC(PreInlNumCSNotInlined,
 | |
|           "Number of functions not inlined with context sensitive profile");
 | |
| STATISTIC(PreInlNumCSInlinedHitMinLimit,
 | |
|           "Number of functions with FDO inline stopped due to min size limit");
 | |
| STATISTIC(PreInlNumCSInlinedHitMaxLimit,
 | |
|           "Number of functions with FDO inline stopped due to max size limit");
 | |
| STATISTIC(
 | |
|     PreInlNumCSInlinedHitGrowthLimit,
 | |
|     "Number of functions with FDO inline stopped due to growth size limit");
 | |
| 
 | |
| // The switches specify inline thresholds used in SampleProfileLoader inlining.
 | |
| // TODO: the actual threshold to be tuned here because the size here is based
 | |
| // on machine code not LLVM IR.
 | |
| extern cl::opt<int> SampleHotCallSiteThreshold;
 | |
| extern cl::opt<int> SampleColdCallSiteThreshold;
 | |
| extern cl::opt<int> ProfileInlineGrowthLimit;
 | |
| extern cl::opt<int> ProfileInlineLimitMin;
 | |
| extern cl::opt<int> ProfileInlineLimitMax;
 | |
| extern cl::opt<bool> SortProfiledSCC;
 | |
| 
 | |
| cl::opt<bool> EnableCSPreInliner(
 | |
|     "csspgo-preinliner", cl::Hidden, cl::init(true),
 | |
|     cl::desc("Run a global pre-inliner to merge context profile based on "
 | |
|              "estimated global top-down inline decisions"));
 | |
| 
 | |
| cl::opt<bool> UseContextCostForPreInliner(
 | |
|     "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
 | |
|     cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
 | |
| 
 | |
| static cl::opt<bool> SamplePreInlineReplay(
 | |
|     "csspgo-replay-preinline", cl::Hidden, cl::init(false),
 | |
|     cl::desc(
 | |
|         "Replay previous inlining and adjust context profile accordingly"));
 | |
| 
 | |
| CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary,
 | |
|                            ProfileSummary *Summary)
 | |
|     : UseContextCost(UseContextCostForPreInliner),
 | |
|       // TODO: Pass in a guid-to-name map in order for
 | |
|       // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
 | |
|       // as their profile context.
 | |
|       ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary),
 | |
|       Summary(Summary) {
 | |
|   // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
 | |
|   // for good performance with reasonable profile size.
 | |
|   if (!SampleHotCallSiteThreshold.getNumOccurrences())
 | |
|     SampleHotCallSiteThreshold = 1500;
 | |
|   if (!SampleColdCallSiteThreshold.getNumOccurrences())
 | |
|     SampleColdCallSiteThreshold = 0;
 | |
|   if (!ProfileInlineLimitMax.getNumOccurrences())
 | |
|     ProfileInlineLimitMax = 3000;
 | |
| }
 | |
| 
 | |
| std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
 | |
|   std::vector<StringRef> Order;
 | |
|   ProfiledCallGraph ProfiledCG(ContextTracker);
 | |
| 
 | |
|   // Now that we have a profiled call graph, construct top-down order
 | |
|   // by building up SCC and reversing SCC order.
 | |
|   scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
 | |
|   while (!I.isAtEnd()) {
 | |
|     auto Range = *I;
 | |
|     if (SortProfiledSCC) {
 | |
|       // Sort nodes in one SCC based on callsite hotness.
 | |
|       scc_member_iterator<ProfiledCallGraph *> SI(*I);
 | |
|       Range = *SI;
 | |
|     }
 | |
|     for (auto *Node : Range) {
 | |
|       if (Node != ProfiledCG.getEntryNode())
 | |
|         Order.push_back(Node->Name);
 | |
|     }
 | |
|     ++I;
 | |
|   }
 | |
|   std::reverse(Order.begin(), Order.end());
 | |
| 
 | |
|   return Order;
 | |
| }
 | |
| 
 | |
| bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
 | |
|                                        const FunctionSamples *CallerSamples) {
 | |
|   assert(CallerSamples && "Expect non-null caller samples");
 | |
| 
 | |
|   // Ideally we want to consider everything a function calls, but as far as
 | |
|   // context profile is concerned, only those frames that are children of
 | |
|   // current one in the trie is relavent. So we walk the trie instead of call
 | |
|   // targets from function profile.
 | |
|   ContextTrieNode *CallerNode =
 | |
|       ContextTracker.getContextFor(CallerSamples->getContext());
 | |
| 
 | |
|   bool HasNewCandidate = false;
 | |
|   for (auto &Child : CallerNode->getAllChildContext()) {
 | |
|     ContextTrieNode *CalleeNode = &Child.second;
 | |
|     FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
 | |
|     if (!CalleeSamples)
 | |
|       continue;
 | |
| 
 | |
|     // Call site count is more reliable, so we look up the corresponding call
 | |
|     // target profile in caller's context profile to retrieve call site count.
 | |
|     uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
 | |
|     uint64_t CallsiteCount = 0;
 | |
|     LineLocation Callsite = CalleeNode->getCallSiteLoc();
 | |
|     if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
 | |
|       SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
 | |
|       auto It = TargetCounts.find(CalleeSamples->getName());
 | |
|       if (It != TargetCounts.end())
 | |
|         CallsiteCount = It->second;
 | |
|     }
 | |
| 
 | |
|     // TODO: call site and callee entry count should be mostly consistent, add
 | |
|     // check for that.
 | |
|     HasNewCandidate = true;
 | |
|     uint32_t CalleeSize = getFuncSize(*CalleeSamples);
 | |
|     CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
 | |
|                    CalleeSize);
 | |
|   }
 | |
| 
 | |
|   return HasNewCandidate;
 | |
| }
 | |
| 
 | |
| uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) {
 | |
|   if (UseContextCost) {
 | |
|     return Binary.getFuncSizeForContext(FSamples.getContext());
 | |
|   }
 | |
| 
 | |
|   return FSamples.getBodySamples().size();
 | |
| }
 | |
| 
 | |
| bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
 | |
|   // If replay inline is requested, simply follow the inline decision of the
 | |
|   // profiled binary.
 | |
|   if (SamplePreInlineReplay)
 | |
|     return Candidate.CalleeSamples->getContext().hasAttribute(
 | |
|         ContextWasInlined);
 | |
| 
 | |
|   unsigned int SampleThreshold = SampleColdCallSiteThreshold;
 | |
|   uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
 | |
|       (Summary->getDetailedSummary()));
 | |
| 
 | |
|   if (Candidate.CallsiteCount <= ColdCountThreshold)
 | |
|     SampleThreshold = SampleColdCallSiteThreshold;
 | |
|   else {
 | |
|     // Linearly adjust threshold based on normalized hotness, i.e, a value in
 | |
|     // [0,1]. Use 10% cutoff instead of the max count as the normalization
 | |
|     // upperbound for stability.
 | |
|     double NormalizationUpperBound =
 | |
|         ProfileSummaryBuilder::getEntryForPercentile(
 | |
|             Summary->getDetailedSummary(), 100000 /* 10% */)
 | |
|             .MinCount;
 | |
|     double NormalizationLowerBound = ColdCountThreshold;
 | |
|     double NormalizedHotness =
 | |
|         (Candidate.CallsiteCount - NormalizationLowerBound) /
 | |
|         (NormalizationUpperBound - NormalizationLowerBound);
 | |
|     if (NormalizedHotness > 1.0)
 | |
|       NormalizedHotness = 1.0;
 | |
|     // Add 1 to to ensure hot callsites get a non-zero threshold, which could
 | |
|     // happen when SampleColdCallSiteThreshold is 0. This is when we do not
 | |
|     // want any inlining for cold callsites.
 | |
|     SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
 | |
|                       SampleColdCallSiteThreshold + 1;
 | |
|   }
 | |
| 
 | |
|   return (Candidate.SizeCost < SampleThreshold);
 | |
| }
 | |
| 
 | |
| void CSPreInliner::processFunction(const StringRef Name) {
 | |
|   FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
 | |
|   if (!FSamples)
 | |
|     return;
 | |
| 
 | |
|   unsigned FuncSize = getFuncSize(*FSamples);
 | |
|   unsigned FuncFinalSize = FuncSize;
 | |
|   unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
 | |
|   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
 | |
|   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
 | |
| 
 | |
|   LLVM_DEBUG(dbgs() << "Process " << Name
 | |
|                     << " for context-sensitive pre-inlining (pre-inline size: "
 | |
|                     << FuncSize << ", size limit: " << SizeLimit << ")\n");
 | |
| 
 | |
|   ProfiledCandidateQueue CQueue;
 | |
|   getInlineCandidates(CQueue, FSamples);
 | |
| 
 | |
|   while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
 | |
|     ProfiledInlineCandidate Candidate = CQueue.top();
 | |
|     CQueue.pop();
 | |
|     bool ShouldInline = false;
 | |
|     if ((ShouldInline = shouldInline(Candidate))) {
 | |
|       // We mark context as inlined as the corresponding context profile
 | |
|       // won't be merged into that function's base profile.
 | |
|       ++PreInlNumCSInlined;
 | |
|       ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
 | |
|       Candidate.CalleeSamples->getContext().setAttribute(
 | |
|           ContextShouldBeInlined);
 | |
|       FuncFinalSize += Candidate.SizeCost;
 | |
|       getInlineCandidates(CQueue, Candidate.CalleeSamples);
 | |
|     } else {
 | |
|       ++PreInlNumCSNotInlined;
 | |
|     }
 | |
|     LLVM_DEBUG(dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
 | |
|                       << " context profile for: "
 | |
|                       << Candidate.CalleeSamples->getContext().toString()
 | |
|                       << " (callee size: " << Candidate.SizeCost
 | |
|                       << ", call count:" << Candidate.CallsiteCount << ")\n");
 | |
|   }
 | |
| 
 | |
|   if (!CQueue.empty()) {
 | |
|     if (SizeLimit == (unsigned)ProfileInlineLimitMax)
 | |
|       ++PreInlNumCSInlinedHitMaxLimit;
 | |
|     else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
 | |
|       ++PreInlNumCSInlinedHitMinLimit;
 | |
|     else
 | |
|       ++PreInlNumCSInlinedHitGrowthLimit;
 | |
|   }
 | |
| 
 | |
|   LLVM_DEBUG({
 | |
|     if (!CQueue.empty())
 | |
|       dbgs() << "  Inline candidates ignored due to size limit (inliner "
 | |
|                 "original size: "
 | |
|              << FuncSize << ", inliner final size: " << FuncFinalSize
 | |
|              << ", size limit: " << SizeLimit << ")\n";
 | |
| 
 | |
|     while (!CQueue.empty()) {
 | |
|       ProfiledInlineCandidate Candidate = CQueue.top();
 | |
|       CQueue.pop();
 | |
|       bool WasInlined =
 | |
|           Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
 | |
|       dbgs() << "    " << Candidate.CalleeSamples->getContext().toString()
 | |
|              << " (candidate size:" << Candidate.SizeCost
 | |
|              << ", call count: " << Candidate.CallsiteCount << ", previously "
 | |
|              << (WasInlined ? "inlined)\n" : "not inlined)\n");
 | |
|     }
 | |
|   });
 | |
| }
 | |
| 
 | |
| void CSPreInliner::run() {
 | |
| #ifndef NDEBUG
 | |
|   auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) {
 | |
|     dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
 | |
|            << Profiles.size() << " total):\n";
 | |
|     for (auto &It : Profiles) {
 | |
|       const FunctionSamples &Samples = It.second;
 | |
|       dbgs() << "  [" << Samples.getContext().toString() << "] "
 | |
|              << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
 | |
|              << "\n";
 | |
|     }
 | |
|   };
 | |
| #endif
 | |
| 
 | |
|   LLVM_DEBUG(printProfileNames(ProfileMap, true));
 | |
| 
 | |
|   // Execute global pre-inliner to estimate a global top-down inline
 | |
|   // decision and merge profiles accordingly. This helps with profile
 | |
|   // merge for ThinLTO otherwise we won't be able to merge profiles back
 | |
|   // to base profile across module/thin-backend boundaries.
 | |
|   // It also helps better compress context profile to control profile
 | |
|   // size, as we now only need context profile for functions going to
 | |
|   // be inlined.
 | |
|   for (StringRef FuncName : buildTopDownOrder()) {
 | |
|     processFunction(FuncName);
 | |
|   }
 | |
| 
 | |
|   // Not inlined context profiles are merged into its base, so we can
 | |
|   // trim out such profiles from the output.
 | |
|   std::vector<SampleContext> ProfilesToBeRemoved;
 | |
|   for (auto &It : ProfileMap) {
 | |
|     SampleContext &Context = It.second.getContext();
 | |
|     if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
 | |
|       assert(Context.hasState(MergedContext) &&
 | |
|              "Not inlined context profile should be merged already");
 | |
|       ProfilesToBeRemoved.push_back(It.first);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (auto &ContextName : ProfilesToBeRemoved) {
 | |
|     ProfileMap.erase(ContextName);
 | |
|   }
 | |
| 
 | |
|   // Make sure ProfileMap's key is consistent with FunctionSamples' name.
 | |
|   SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
 | |
| 
 | |
|   FunctionSamples::ProfileIsPreInlined = true;
 | |
| 
 | |
|   LLVM_DEBUG(printProfileNames(ProfileMap, false));
 | |
| }
 |