forked from OSchip/llvm-project
				
			
		
			
				
	
	
		
			469 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			469 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			C++
		
	
	
	
| //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
 | |
| //
 | |
| //                     The LLVM Compiler Infrastructure
 | |
| //
 | |
| // This file is distributed under the University of Illinois Open Source
 | |
| // License. See LICENSE.TXT for details.
 | |
| //
 | |
| //===----------------------------------------------------------------------===//
 | |
| /// This file implements SLP analysis based on VPlan. The analysis is based on
 | |
| /// the ideas described in
 | |
| ///
 | |
| ///   Look-ahead SLP: auto-vectorization in the presence of commutative
 | |
| ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
 | |
| ///   Luís F. W. Góes
 | |
| ///
 | |
| //===----------------------------------------------------------------------===//
 | |
| 
 | |
| #include "VPlan.h"
 | |
| #include "llvm/ADT/DepthFirstIterator.h"
 | |
| #include "llvm/ADT/PostOrderIterator.h"
 | |
| #include "llvm/ADT/SmallVector.h"
 | |
| #include "llvm/ADT/Twine.h"
 | |
| #include "llvm/Analysis/LoopInfo.h"
 | |
| #include "llvm/Analysis/VectorUtils.h"
 | |
| #include "llvm/IR/BasicBlock.h"
 | |
| #include "llvm/IR/CFG.h"
 | |
| #include "llvm/IR/Dominators.h"
 | |
| #include "llvm/IR/InstrTypes.h"
 | |
| #include "llvm/IR/Instruction.h"
 | |
| #include "llvm/IR/Instructions.h"
 | |
| #include "llvm/IR/Type.h"
 | |
| #include "llvm/IR/Value.h"
 | |
| #include "llvm/Support/Casting.h"
 | |
| #include "llvm/Support/Debug.h"
 | |
| #include "llvm/Support/ErrorHandling.h"
 | |
| #include "llvm/Support/GraphWriter.h"
 | |
| #include "llvm/Support/raw_ostream.h"
 | |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 | |
| #include <cassert>
 | |
| #include <iterator>
 | |
| #include <string>
 | |
| #include <vector>
 | |
| 
 | |
| using namespace llvm;
 | |
| 
 | |
| #define DEBUG_TYPE "vplan-slp"
 | |
| 
 | |
| // Number of levels to look ahead when re-ordering multi node operands.
 | |
| static unsigned LookaheadMaxDepth = 5;
 | |
| 
 | |
| VPInstruction *VPlanSlp::markFailed() {
 | |
|   // FIXME: Currently this is used to signal we hit instructions we cannot
 | |
|   //        trivially SLP'ize.
 | |
|   CompletelySLP = false;
 | |
|   return nullptr;
 | |
| }
 | |
| 
 | |
| void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
 | |
|   if (all_of(Operands, [](VPValue *V) {
 | |
|         return cast<VPInstruction>(V)->getUnderlyingInstr();
 | |
|       })) {
 | |
|     unsigned BundleSize = 0;
 | |
|     for (VPValue *V : Operands) {
 | |
|       Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
 | |
|       assert(!T->isVectorTy() && "Only scalar types supported for now");
 | |
|       BundleSize += T->getScalarSizeInBits();
 | |
|     }
 | |
|     WidestBundleBits = std::max(WidestBundleBits, BundleSize);
 | |
|   }
 | |
| 
 | |
|   auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
 | |
|   assert(Res.second &&
 | |
|          "Already created a combined instruction for the operand bundle");
 | |
|   (void)Res;
 | |
| }
 | |
| 
 | |
| bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
 | |
|   // Currently we only support VPInstructions.
 | |
|   if (!all_of(Operands, [](VPValue *Op) {
 | |
|         return Op && isa<VPInstruction>(Op) &&
 | |
|                cast<VPInstruction>(Op)->getUnderlyingInstr();
 | |
|       })) {
 | |
|     LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   // Check if opcodes and type width agree for all instructions in the bundle.
 | |
|   // FIXME: Differing widths/opcodes can be handled by inserting additional
 | |
|   //        instructions.
 | |
|   // FIXME: Deal with non-primitive types.
 | |
|   const Instruction *OriginalInstr =
 | |
|       cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
 | |
|   unsigned Opcode = OriginalInstr->getOpcode();
 | |
|   unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
 | |
|   if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
 | |
|         const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
 | |
|         return I->getOpcode() == Opcode &&
 | |
|                I->getType()->getPrimitiveSizeInBits() == Width;
 | |
|       })) {
 | |
|     LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   // For now, all operands must be defined in the same BB.
 | |
|   if (any_of(Operands, [this](VPValue *Op) {
 | |
|         return cast<VPInstruction>(Op)->getParent() != &this->BB;
 | |
|       })) {
 | |
|     LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   if (any_of(Operands,
 | |
|              [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
 | |
|     LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   // For loads, check that there are no instructions writing to memory in
 | |
|   // between them.
 | |
|   // TODO: we only have to forbid instructions writing to memory that could
 | |
|   //       interfere with any of the loads in the bundle
 | |
|   if (Opcode == Instruction::Load) {
 | |
|     unsigned LoadsSeen = 0;
 | |
|     VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
 | |
|     for (auto &I : *Parent) {
 | |
|       auto *VPI = cast<VPInstruction>(&I);
 | |
|       if (VPI->getOpcode() == Instruction::Load &&
 | |
|           std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
 | |
|         LoadsSeen++;
 | |
| 
 | |
|       if (LoadsSeen == Operands.size())
 | |
|         break;
 | |
|       if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
 | |
|         LLVM_DEBUG(
 | |
|             dbgs() << "VPSLP: instruction modifying memory between loads\n");
 | |
|         return false;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (!all_of(Operands, [](VPValue *Op) {
 | |
|           return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
 | |
|               ->isSimple();
 | |
|         })) {
 | |
|       LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
 | |
|       return false;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (Opcode == Instruction::Store)
 | |
|     if (!all_of(Operands, [](VPValue *Op) {
 | |
|           return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
 | |
|               ->isSimple();
 | |
|         })) {
 | |
|       LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
 | |
|       return false;
 | |
|     }
 | |
| 
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
 | |
|                                              unsigned OperandIndex) {
 | |
|   SmallVector<VPValue *, 4> Operands;
 | |
|   for (VPValue *V : Values) {
 | |
|     auto *U = cast<VPUser>(V);
 | |
|     Operands.push_back(U->getOperand(OperandIndex));
 | |
|   }
 | |
|   return Operands;
 | |
| }
 | |
| 
 | |
| static bool areCommutative(ArrayRef<VPValue *> Values) {
 | |
|   return Instruction::isCommutative(
 | |
|       cast<VPInstruction>(Values[0])->getOpcode());
 | |
| }
 | |
| 
 | |
| static SmallVector<SmallVector<VPValue *, 4>, 4>
 | |
| getOperands(ArrayRef<VPValue *> Values) {
 | |
|   SmallVector<SmallVector<VPValue *, 4>, 4> Result;
 | |
|   auto *VPI = cast<VPInstruction>(Values[0]);
 | |
| 
 | |
|   switch (VPI->getOpcode()) {
 | |
|   case Instruction::Load:
 | |
|     llvm_unreachable("Loads terminate a tree, no need to get operands");
 | |
|   case Instruction::Store:
 | |
|     Result.push_back(getOperands(Values, 0));
 | |
|     break;
 | |
|   default:
 | |
|     for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
 | |
|       Result.push_back(getOperands(Values, I));
 | |
|     break;
 | |
|   }
 | |
| 
 | |
|   return Result;
 | |
| }
 | |
| 
 | |
| /// Returns the opcode of Values or ~0 if they do not all agree.
 | |
| static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
 | |
|   unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
 | |
|   if (any_of(Values, [Opcode](VPValue *V) {
 | |
|         return cast<VPInstruction>(V)->getOpcode() != Opcode;
 | |
|       }))
 | |
|     return None;
 | |
|   return {Opcode};
 | |
| }
 | |
| 
 | |
| /// Returns true if A and B access sequential memory if they are loads or
 | |
| /// stores or if they have identical opcodes otherwise.
 | |
| static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
 | |
|                                   VPInterleavedAccessInfo &IAI) {
 | |
|   if (A->getOpcode() != B->getOpcode())
 | |
|     return false;
 | |
| 
 | |
|   if (A->getOpcode() != Instruction::Load &&
 | |
|       A->getOpcode() != Instruction::Store)
 | |
|     return true;
 | |
|   auto *GA = IAI.getInterleaveGroup(A);
 | |
|   auto *GB = IAI.getInterleaveGroup(B);
 | |
| 
 | |
|   return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
 | |
| }
 | |
| 
 | |
| /// Implements getLAScore from Listing 7 in the paper.
 | |
| /// Traverses and compares operands of V1 and V2 to MaxLevel.
 | |
| static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
 | |
|                            VPInterleavedAccessInfo &IAI) {
 | |
|   if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
 | |
|     return 0;
 | |
| 
 | |
|   if (MaxLevel == 0)
 | |
|     return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
 | |
|                                            cast<VPInstruction>(V2), IAI);
 | |
| 
 | |
|   unsigned Score = 0;
 | |
|   for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
 | |
|     for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
 | |
|       Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
 | |
|                           cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
 | |
|   return Score;
 | |
| }
 | |
| 
 | |
| std::pair<VPlanSlp::OpMode, VPValue *>
 | |
| VPlanSlp::getBest(OpMode Mode, VPValue *Last,
 | |
|                   SmallPtrSetImpl<VPValue *> &Candidates,
 | |
|                   VPInterleavedAccessInfo &IAI) {
 | |
|   assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
 | |
|          "Currently we only handle load and commutative opcodes");
 | |
|   LLVM_DEBUG(dbgs() << "      getBest\n");
 | |
| 
 | |
|   SmallVector<VPValue *, 4> BestCandidates;
 | |
|   LLVM_DEBUG(dbgs() << "        Candidates  for "
 | |
|                     << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
 | |
|   for (auto *Candidate : Candidates) {
 | |
|     auto *LastI = cast<VPInstruction>(Last);
 | |
|     auto *CandidateI = cast<VPInstruction>(Candidate);
 | |
|     if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
 | |
|       LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
 | |
|                         << " ");
 | |
|       BestCandidates.push_back(Candidate);
 | |
|     }
 | |
|   }
 | |
|   LLVM_DEBUG(dbgs() << "\n");
 | |
| 
 | |
|   if (BestCandidates.empty())
 | |
|     return {OpMode::Failed, nullptr};
 | |
| 
 | |
|   if (BestCandidates.size() == 1)
 | |
|     return {Mode, BestCandidates[0]};
 | |
| 
 | |
|   VPValue *Best = nullptr;
 | |
|   unsigned BestScore = 0;
 | |
|   for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
 | |
|     unsigned PrevScore = ~0u;
 | |
|     bool AllSame = true;
 | |
| 
 | |
|     // FIXME: Avoid visiting the same operands multiple times.
 | |
|     for (auto *Candidate : BestCandidates) {
 | |
|       unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
 | |
|       if (PrevScore == ~0u)
 | |
|         PrevScore = Score;
 | |
|       if (PrevScore != Score)
 | |
|         AllSame = false;
 | |
|       PrevScore = Score;
 | |
| 
 | |
|       if (Score > BestScore) {
 | |
|         BestScore = Score;
 | |
|         Best = Candidate;
 | |
|       }
 | |
|     }
 | |
|     if (!AllSame)
 | |
|       break;
 | |
|   }
 | |
|   LLVM_DEBUG(dbgs() << "Found best "
 | |
|                     << *cast<VPInstruction>(Best)->getUnderlyingInstr()
 | |
|                     << "\n");
 | |
|   Candidates.erase(Best);
 | |
| 
 | |
|   return {Mode, Best};
 | |
| }
 | |
| 
 | |
| SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
 | |
|   SmallVector<MultiNodeOpTy, 4> FinalOrder;
 | |
|   SmallVector<OpMode, 4> Mode;
 | |
|   FinalOrder.reserve(MultiNodeOps.size());
 | |
|   Mode.reserve(MultiNodeOps.size());
 | |
| 
 | |
|   LLVM_DEBUG(dbgs() << "Reordering multinode\n");
 | |
| 
 | |
|   for (auto &Operands : MultiNodeOps) {
 | |
|     FinalOrder.push_back({Operands.first, {Operands.second[0]}});
 | |
|     if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
 | |
|         Instruction::Load)
 | |
|       Mode.push_back(OpMode::Load);
 | |
|     else
 | |
|       Mode.push_back(OpMode::Opcode);
 | |
|   }
 | |
| 
 | |
|   for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
 | |
|     LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
 | |
|     SmallPtrSet<VPValue *, 4> Candidates;
 | |
|     LLVM_DEBUG(dbgs() << "  Candidates  ");
 | |
|     for (auto Ops : MultiNodeOps) {
 | |
|       LLVM_DEBUG(
 | |
|           dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
 | |
|                  << " ");
 | |
|       Candidates.insert(Ops.second[Lane]);
 | |
|     }
 | |
|     LLVM_DEBUG(dbgs() << "\n");
 | |
| 
 | |
|     for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
 | |
|       LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
 | |
|       if (Mode[Op] == OpMode::Failed)
 | |
|         continue;
 | |
| 
 | |
|       VPValue *Last = FinalOrder[Op].second[Lane - 1];
 | |
|       std::pair<OpMode, VPValue *> Res =
 | |
|           getBest(Mode[Op], Last, Candidates, IAI);
 | |
|       if (Res.second)
 | |
|         FinalOrder[Op].second.push_back(Res.second);
 | |
|       else
 | |
|         // TODO: handle this case
 | |
|         FinalOrder[Op].second.push_back(markFailed());
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   return FinalOrder;
 | |
| }
 | |
| 
 | |
| void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
 | |
|   dbgs() << " Ops: ";
 | |
|   for (auto Op : Values)
 | |
|     if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
 | |
|       dbgs() << *Instr << " | ";
 | |
|     else
 | |
|       dbgs() << " nullptr | ";
 | |
|   dbgs() << "\n";
 | |
| }
 | |
| 
 | |
| VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
 | |
|   assert(!Values.empty() && "Need some operands!");
 | |
| 
 | |
|   // If we already visited this instruction bundle, re-use the existing node
 | |
|   auto I = BundleToCombined.find(to_vector<4>(Values));
 | |
|   if (I != BundleToCombined.end()) {
 | |
| #ifndef NDEBUG
 | |
|     // Check that the resulting graph is a tree. If we re-use a node, this means
 | |
|     // its values have multiple users. We only allow this, if all users of each
 | |
|     // value are the same instruction.
 | |
|     for (auto *V : Values) {
 | |
|       auto UI = V->user_begin();
 | |
|       auto *FirstUser = *UI++;
 | |
|       while (UI != V->user_end()) {
 | |
|         assert(*UI == FirstUser && "Currently we only support SLP trees.");
 | |
|         UI++;
 | |
|       }
 | |
|     }
 | |
| #endif
 | |
|     return I->second;
 | |
|   }
 | |
| 
 | |
|   // Dump inputs
 | |
|   LLVM_DEBUG({
 | |
|     dbgs() << "buildGraph: ";
 | |
|     dumpBundle(Values);
 | |
|   });
 | |
| 
 | |
|   if (!areVectorizable(Values))
 | |
|     return markFailed();
 | |
| 
 | |
|   assert(getOpcode(Values) && "Opcodes for all values must match");
 | |
|   unsigned ValuesOpcode = getOpcode(Values).getValue();
 | |
| 
 | |
|   SmallVector<VPValue *, 4> CombinedOperands;
 | |
|   if (areCommutative(Values)) {
 | |
|     bool MultiNodeRoot = !MultiNodeActive;
 | |
|     MultiNodeActive = true;
 | |
|     for (auto &Operands : getOperands(Values)) {
 | |
|       LLVM_DEBUG({
 | |
|         dbgs() << "  Visiting Commutative";
 | |
|         dumpBundle(Operands);
 | |
|       });
 | |
| 
 | |
|       auto OperandsOpcode = getOpcode(Operands);
 | |
|       if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
 | |
|         LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
 | |
|         CombinedOperands.push_back(buildGraph(Operands));
 | |
|       } else {
 | |
|         LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
 | |
|         // Create dummy VPInstruction, which will we replace later by the
 | |
|         // re-ordered operand.
 | |
|         VPInstruction *Op = new VPInstruction(0, {});
 | |
|         CombinedOperands.push_back(Op);
 | |
|         MultiNodeOps.emplace_back(Op, Operands);
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (MultiNodeRoot) {
 | |
|       LLVM_DEBUG(dbgs() << "Reorder \n");
 | |
|       MultiNodeActive = false;
 | |
| 
 | |
|       auto FinalOrder = reorderMultiNodeOps();
 | |
| 
 | |
|       MultiNodeOps.clear();
 | |
|       for (auto &Ops : FinalOrder) {
 | |
|         VPInstruction *NewOp = buildGraph(Ops.second);
 | |
|         Ops.first->replaceAllUsesWith(NewOp);
 | |
|         for (unsigned i = 0; i < CombinedOperands.size(); i++)
 | |
|           if (CombinedOperands[i] == Ops.first)
 | |
|             CombinedOperands[i] = NewOp;
 | |
|         delete Ops.first;
 | |
|         Ops.first = NewOp;
 | |
|       }
 | |
|       LLVM_DEBUG(dbgs() << "Found final order\n");
 | |
|     }
 | |
|   } else {
 | |
|     LLVM_DEBUG(dbgs() << "  NonCommuntative\n");
 | |
|     if (ValuesOpcode == Instruction::Load)
 | |
|       for (VPValue *V : Values)
 | |
|         CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
 | |
|     else
 | |
|       for (auto &Operands : getOperands(Values))
 | |
|         CombinedOperands.push_back(buildGraph(Operands));
 | |
|   }
 | |
| 
 | |
|   unsigned Opcode;
 | |
|   switch (ValuesOpcode) {
 | |
|   case Instruction::Load:
 | |
|     Opcode = VPInstruction::SLPLoad;
 | |
|     break;
 | |
|   case Instruction::Store:
 | |
|     Opcode = VPInstruction::SLPStore;
 | |
|     break;
 | |
|   default:
 | |
|     Opcode = ValuesOpcode;
 | |
|     break;
 | |
|   }
 | |
| 
 | |
|   if (!CompletelySLP)
 | |
|     return markFailed();
 | |
| 
 | |
|   assert(CombinedOperands.size() > 0 && "Need more some operands");
 | |
|   auto *VPI = new VPInstruction(Opcode, CombinedOperands);
 | |
|   VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
 | |
| 
 | |
|   LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
 | |
|              cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
 | |
|   addCombined(Values, VPI);
 | |
|   return VPI;
 | |
| }
 |