forked from OSchip/llvm-project
				
			
		
			
				
	
	
		
			344 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			344 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
| //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
 | |
| //
 | |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | |
| // See https://llvm.org/LICENSE.txt for license information.
 | |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | |
| //
 | |
| //===----------------------------------------------------------------------===//
 | |
| /// \file
 | |
| ///
 | |
| /// This file implements the bottleneck analysis view.
 | |
| ///
 | |
| /// This view internally observes backend pressure increase events in order to
 | |
| /// identify problematic data dependencies and processor resource interferences.
 | |
| ///
 | |
| /// Example of bottleneck analysis report for a dot-product on X86 btver2:
 | |
| ///
 | |
| /// Cycles with backend pressure increase [ 40.76% ]
 | |
| /// Throughput Bottlenecks: 
 | |
| ///   Resource Pressure       [ 39.34% ]
 | |
| ///   - JFPA  [ 39.34% ]
 | |
| ///   - JFPU0  [ 39.34% ]
 | |
| ///   Data Dependencies:      [ 1.42% ]
 | |
| ///   - Register Dependencies [ 1.42% ]
 | |
| ///   - Memory Dependencies   [ 0.00% ]
 | |
| ///
 | |
| /// According to the example, backend pressure increased during the 40.76% of
 | |
| /// the simulated cycles.  In particular, the major cause of backend pressure
 | |
| /// increases was the contention on floating point adder JFPA accessible from
 | |
| /// pipeline resource JFPU0.
 | |
| ///
 | |
| /// At the end of each cycle, if pressure on the simulated out-of-order buffers
 | |
| /// has increased, a backend pressure event is reported.
 | |
| /// In particular, this occurs when there is a delta between the number of uOps
 | |
| /// dispatched and the number of uOps issued to the underlying pipelines.
 | |
| ///
 | |
| /// The bottleneck analysis view is also responsible for identifying and printing
 | |
| /// the most "critical" sequence of dependent instructions according to the
 | |
| /// simulated run.
 | |
| ///
 | |
| /// Below is the critical sequence computed for the dot-product example on
 | |
| /// btver2:
 | |
| ///
 | |
| ///              Instruction                     Dependency Information
 | |
| /// +----< 2.    vhaddps %xmm3, %xmm3, %xmm4
 | |
| /// |
 | |
| /// |    < loop carried > 
 | |
| /// |
 | |
| /// |      0.    vmulps	 %xmm0, %xmm0, %xmm2
 | |
| /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
 | |
| /// +----> 2.    vhaddps %xmm3, %xmm3, %xmm4     ## REGISTER dependency:  %xmm3
 | |
| /// |
 | |
| /// |    < loop carried > 
 | |
| /// |
 | |
| /// +----> 1.    vhaddps %xmm2, %xmm2, %xmm3     ## RESOURCE interference:  JFPA [ probability: 73% ]
 | |
| ///
 | |
| ///
 | |
| /// The algorithm that computes the critical sequence is very similar to a
 | |
| /// critical path analysis.
 | |
| /// 
 | |
| /// A dependency graph is used internally to track dependencies between nodes.
 | |
| /// Nodes of the graph represent instructions from the input assembly sequence,
 | |
| /// and edges of the graph represent data dependencies or processor resource
 | |
| /// interferences.
 | |
| ///
 | |
| /// Edges are dynamically 'discovered' by observing instruction state transitions
 | |
| /// and backend pressure increase events. Edges are internally ranked based on
 | |
| /// their "criticality". A dependency is considered to be critical if it takes a
 | |
| /// long time to execute, and if it contributes to backend pressure increases.
 | |
| /// Criticality is internally measured in terms of cycles; it is computed for
 | |
| /// every edge in the graph as a function of the edge latency and the number of
 | |
| /// backend pressure increase cycles contributed by that edge.
 | |
| ///
 | |
| /// At the end of simulation, costs are propagated to nodes through the edges of
 | |
| /// the graph, and the most expensive path connecting the root-set (a
 | |
| /// set of nodes with no predecessors) to a leaf node is reported as critical
 | |
| /// sequence.
 | |
| //
 | |
| //===----------------------------------------------------------------------===//
 | |
| 
 | |
| #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
 | |
| #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
 | |
| 
 | |
| #include "Views/View.h"
 | |
| #include "llvm/ADT/DenseMap.h"
 | |
| #include "llvm/ADT/SmallVector.h"
 | |
| #include "llvm/MC/MCInstPrinter.h"
 | |
| #include "llvm/MC/MCSchedule.h"
 | |
| #include "llvm/MC/MCSubtargetInfo.h"
 | |
| #include "llvm/Support/raw_ostream.h"
 | |
| 
 | |
| namespace llvm {
 | |
| namespace mca {
 | |
| 
 | |
| class PressureTracker {
 | |
|   const MCSchedModel &SM;
 | |
| 
 | |
|   // Resource pressure distribution. There is an element for every processor
 | |
|   // resource declared by the scheduling model. Quantities are number of cycles.
 | |
|   SmallVector<unsigned, 4> ResourcePressureDistribution;
 | |
| 
 | |
|   // Each processor resource is associated with a so-called processor resource
 | |
|   // mask. This vector allows to correlate processor resource IDs with processor
 | |
|   // resource masks. There is exactly one element per each processor resource
 | |
|   // declared by the scheduling model.
 | |
|   SmallVector<uint64_t, 4> ProcResID2Mask;
 | |
| 
 | |
|   // Maps processor resource state indices (returned by calls to
 | |
|   // `getResourceStateIndex(Mask)` to processor resource identifiers.
 | |
|   SmallVector<unsigned, 4> ResIdx2ProcResID;
 | |
| 
 | |
|   // Maps Processor Resource identifiers to ResourceUsers indices.
 | |
|   SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
 | |
| 
 | |
|   // Identifies the last user of a processor resource unit.
 | |
|   // This vector is updated on every instruction issued event.
 | |
|   // There is one entry for every processor resource unit declared by the
 | |
|   // processor model. An all_ones value is treated like an invalid instruction
 | |
|   // identifier.
 | |
|   using User = std::pair<unsigned, unsigned>;
 | |
|   SmallVector<User, 4> ResourceUsers;
 | |
| 
 | |
|   struct InstructionPressureInfo {
 | |
|     unsigned RegisterPressureCycles;
 | |
|     unsigned MemoryPressureCycles;
 | |
|     unsigned ResourcePressureCycles;
 | |
|   };
 | |
|   DenseMap<unsigned, InstructionPressureInfo> IPI;
 | |
| 
 | |
|   void updateResourcePressureDistribution(uint64_t CumulativeMask);
 | |
| 
 | |
|   User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
 | |
|     unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
 | |
|     return ResourceUsers[Index + UnitID];
 | |
|   }
 | |
| 
 | |
| public:
 | |
|   PressureTracker(const MCSchedModel &Model);
 | |
| 
 | |
|   ArrayRef<unsigned> getResourcePressureDistribution() const {
 | |
|     return ResourcePressureDistribution;
 | |
|   }
 | |
| 
 | |
|   void getResourceUsers(uint64_t ResourceMask,
 | |
|                         SmallVectorImpl<User> &Users) const;
 | |
| 
 | |
|   unsigned getRegisterPressureCycles(unsigned IID) const {
 | |
|     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 | |
|     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 | |
|     return Info.RegisterPressureCycles;
 | |
|   }
 | |
| 
 | |
|   unsigned getMemoryPressureCycles(unsigned IID) const {
 | |
|     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 | |
|     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 | |
|     return Info.MemoryPressureCycles;
 | |
|   }
 | |
| 
 | |
|   unsigned getResourcePressureCycles(unsigned IID) const {
 | |
|     assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
 | |
|     const InstructionPressureInfo &Info = IPI.find(IID)->second;
 | |
|     return Info.ResourcePressureCycles;
 | |
|   }
 | |
| 
 | |
|   const char *resolveResourceName(uint64_t ResourceMask) const {
 | |
|     unsigned Index = getResourceStateIndex(ResourceMask);
 | |
|     unsigned ProcResID = ResIdx2ProcResID[Index];
 | |
|     const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
 | |
|     return PRDesc.Name;
 | |
|   }
 | |
| 
 | |
|   void onInstructionDispatched(unsigned IID);
 | |
|   void onInstructionExecuted(unsigned IID);
 | |
| 
 | |
|   void handlePressureEvent(const HWPressureEvent &Event);
 | |
|   void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
 | |
| };
 | |
| 
 | |
| // A dependency edge.
 | |
| struct DependencyEdge {
 | |
|   enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
 | |
| 
 | |
|   // Dependency edge descriptor.
 | |
|   //
 | |
|   // It specifies the dependency type, as well as the edge cost in cycles.
 | |
|   struct Dependency {
 | |
|     DependencyType Type;
 | |
|     uint64_t ResourceOrRegID;
 | |
|     uint64_t Cost;
 | |
|   };
 | |
|   Dependency Dep;
 | |
| 
 | |
|   unsigned FromIID;
 | |
|   unsigned ToIID;
 | |
| 
 | |
|   // Used by the bottleneck analysis to compute the interference
 | |
|   // probability for processor resources.
 | |
|   unsigned Frequency;
 | |
| };
 | |
| 
 | |
| // A dependency graph used by the bottleneck analysis to describe data
 | |
| // dependencies and processor resource interferences between instructions.
 | |
| //
 | |
| // There is a node (an instance of struct DGNode) for every instruction in the
 | |
| // input assembly sequence. Edges of the graph represent dependencies between
 | |
| // instructions.
 | |
| //
 | |
| // Each edge of the graph is associated with a cost value which is used
 | |
| // internally to rank dependency based on their impact on the runtime
 | |
| // performance (see field DependencyEdge::Dependency::Cost). In general, the
 | |
| // higher the cost of an edge, the higher the impact on performance.
 | |
| //
 | |
| // The cost of a dependency is a function of both the latency and the number of
 | |
| // cycles where the dependency has been seen as critical (i.e. contributing to
 | |
| // back-pressure increases).
 | |
| //
 | |
| // Loop carried dependencies are carefully expanded by the bottleneck analysis
 | |
| // to guarantee that the graph stays acyclic. To this end, extra nodes are
 | |
| // pre-allocated at construction time to describe instructions from "past and
 | |
| // future" iterations. The graph is kept acyclic mainly because it simplifies the
 | |
| // complexity of the algorithm that computes the critical sequence.
 | |
| class DependencyGraph {
 | |
|   struct DGNode {
 | |
|     unsigned NumPredecessors;
 | |
|     unsigned NumVisitedPredecessors;
 | |
|     uint64_t Cost;
 | |
|     unsigned Depth;
 | |
| 
 | |
|     DependencyEdge CriticalPredecessor;
 | |
|     SmallVector<DependencyEdge, 8> OutgoingEdges;
 | |
|   };
 | |
|   SmallVector<DGNode, 16> Nodes;
 | |
| 
 | |
|   DependencyGraph(const DependencyGraph &) = delete;
 | |
|   DependencyGraph &operator=(const DependencyGraph &) = delete;
 | |
| 
 | |
|   void addDependency(unsigned From, unsigned To,
 | |
|                      DependencyEdge::Dependency &&DE);
 | |
| 
 | |
|   void pruneEdges(unsigned Iterations);
 | |
|   void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
 | |
|   void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
 | |
| 
 | |
| #ifndef NDEBUG
 | |
|   void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
 | |
|                           MCInstPrinter &MCIP) const;
 | |
| #endif
 | |
| 
 | |
| public:
 | |
|   DependencyGraph(unsigned Size) : Nodes(Size) {}
 | |
| 
 | |
|   void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
 | |
|                       unsigned Cost) {
 | |
|     addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
 | |
|   }
 | |
| 
 | |
|   void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
 | |
|     addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
 | |
|   }
 | |
| 
 | |
|   void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
 | |
|                       unsigned Cost) {
 | |
|     addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
 | |
|   }
 | |
| 
 | |
|   // Called by the bottleneck analysis at the end of simulation to propagate
 | |
|   // costs through the edges of the graph, and compute a critical path.
 | |
|   void finalizeGraph(unsigned Iterations) {
 | |
|     SmallVector<unsigned, 16> RootSet;
 | |
|     pruneEdges(Iterations);
 | |
|     initializeRootSet(RootSet);
 | |
|     propagateThroughEdges(RootSet, Iterations);
 | |
|   }
 | |
| 
 | |
|   // Returns a sequence of edges representing the critical sequence based on the
 | |
|   // simulated run. It assumes that the graph has already been finalized (i.e.
 | |
|   // method `finalizeGraph()` has already been called on this graph).
 | |
|   void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
 | |
| 
 | |
| #ifndef NDEBUG
 | |
|   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
 | |
| #endif
 | |
| };
 | |
| 
 | |
| /// A view that collects and prints a few performance numbers.
 | |
| class BottleneckAnalysis : public View {
 | |
|   const MCSubtargetInfo &STI;
 | |
|   MCInstPrinter &MCIP;
 | |
|   PressureTracker Tracker;
 | |
|   DependencyGraph DG;
 | |
| 
 | |
|   ArrayRef<MCInst> Source;
 | |
|   unsigned Iterations;
 | |
|   unsigned TotalCycles;
 | |
| 
 | |
|   bool PressureIncreasedBecauseOfResources;
 | |
|   bool PressureIncreasedBecauseOfRegisterDependencies;
 | |
|   bool PressureIncreasedBecauseOfMemoryDependencies;
 | |
|   // True if throughput was affected by dispatch stalls.
 | |
|   bool SeenStallCycles;
 | |
| 
 | |
|   struct BackPressureInfo {
 | |
|     // Cycles where backpressure increased.
 | |
|     unsigned PressureIncreaseCycles;
 | |
|     // Cycles where backpressure increased because of pipeline pressure.
 | |
|     unsigned ResourcePressureCycles;
 | |
|     // Cycles where backpressure increased because of data dependencies.
 | |
|     unsigned DataDependencyCycles;
 | |
|     // Cycles where backpressure increased because of register dependencies.
 | |
|     unsigned RegisterDependencyCycles;
 | |
|     // Cycles where backpressure increased because of memory dependencies.
 | |
|     unsigned MemoryDependencyCycles;
 | |
|   };
 | |
|   BackPressureInfo BPI;
 | |
| 
 | |
|   // Used to populate the dependency graph DG.
 | |
|   void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
 | |
|   void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
 | |
|   void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
 | |
| 
 | |
|   // Prints a bottleneck message to OS.
 | |
|   void printBottleneckHints(raw_ostream &OS) const;
 | |
|   void printCriticalSequence(raw_ostream &OS) const;
 | |
| 
 | |
| public:
 | |
|   BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
 | |
|                      ArrayRef<MCInst> Sequence, unsigned Iterations);
 | |
| 
 | |
|   void onCycleEnd() override;
 | |
|   void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
 | |
|   void onEvent(const HWPressureEvent &Event) override;
 | |
|   void onEvent(const HWInstructionEvent &Event) override;
 | |
| 
 | |
|   void printView(raw_ostream &OS) const override;
 | |
| 
 | |
| #ifndef NDEBUG
 | |
|   void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
 | |
| #endif
 | |
| };
 | |
| 
 | |
| } // namespace mca
 | |
| } // namespace llvm
 | |
| 
 | |
| #endif
 |