mirror of https://github.com/llvm/circt.git
669 lines
26 KiB
C++
669 lines
26 KiB
C++
//===- AffineToLoopSchedule.cpp--------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "circt/Conversion/AffineToLoopSchedule.h"
|
|
#include "circt/Analysis/DependenceAnalysis.h"
|
|
#include "circt/Analysis/SchedulingAnalysis.h"
|
|
#include "circt/Dialect/LoopSchedule/LoopScheduleOps.h"
|
|
#include "circt/Scheduling/Algorithms.h"
|
|
#include "circt/Scheduling/Problems.h"
|
|
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
|
|
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
|
|
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
|
|
#include "mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h"
|
|
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
|
#include "mlir/Dialect/Affine/LoopUtils.h"
|
|
#include "mlir/Dialect/Affine/Utils.h"
|
|
#include "mlir/Dialect/Arith/IR/Arith.h"
|
|
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
|
|
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/Dialect/SCF/IR/SCF.h"
|
|
#include "mlir/IR/BuiltinDialect.h"
|
|
#include "mlir/IR/Dominance.h"
|
|
#include "mlir/IR/IRMapping.h"
|
|
#include "mlir/IR/ImplicitLocOpBuilder.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/TypeSwitch.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include <cassert>
|
|
#include <limits>
|
|
|
|
#define DEBUG_TYPE "affine-to-loopschedule"
|
|
|
|
namespace circt {
|
|
#define GEN_PASS_DEF_AFFINETOLOOPSCHEDULE
|
|
#include "circt/Conversion/Passes.h.inc"
|
|
} // namespace circt
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::arith;
|
|
using namespace mlir::memref;
|
|
using namespace mlir::scf;
|
|
using namespace mlir::func;
|
|
using namespace mlir::affine;
|
|
using namespace circt;
|
|
using namespace circt::analysis;
|
|
using namespace circt::scheduling;
|
|
using namespace circt::loopschedule;
|
|
|
|
namespace {
|
|
|
|
struct AffineToLoopSchedule
|
|
: public circt::impl::AffineToLoopScheduleBase<AffineToLoopSchedule> {
|
|
void runOnOperation() override;
|
|
|
|
private:
|
|
ModuloProblem getModuloProblem(CyclicProblem &prob);
|
|
LogicalResult
|
|
lowerAffineStructures(MemoryDependenceAnalysis &dependenceAnalysis);
|
|
LogicalResult populateOperatorTypes(SmallVectorImpl<AffineForOp> &loopNest,
|
|
ModuloProblem &problem);
|
|
LogicalResult solveSchedulingProblem(SmallVectorImpl<AffineForOp> &loopNest,
|
|
ModuloProblem &problem);
|
|
LogicalResult
|
|
createLoopSchedulePipeline(SmallVectorImpl<AffineForOp> &loopNest,
|
|
ModuloProblem &problem);
|
|
|
|
CyclicSchedulingAnalysis *schedulingAnalysis;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/// Build a ModuloProblem for the same containing operation as `prob`, copying
/// over its operations together with whatever operator types, latencies,
/// resource types, auxiliary dependences and distances have been set.
ModuloProblem AffineToLoopSchedule::getModuloProblem(CyclicProblem &prob) {
  ModuloProblem moduloProblem(prob.getContainingOp());

  // First pass: operations and the per-operation properties that are present.
  for (auto *operation : prob.getOperations()) {
    if (auto operatorType = prob.getLinkedOperatorType(operation)) {
      moduloProblem.setLinkedOperatorType(operation, *operatorType);
      if (auto latency = prob.getLatency(*operatorType))
        moduloProblem.setLatency(*operatorType, *latency);
    }

    if (auto resourceTypes = prob.getLinkedResourceTypes(operation))
      moduloProblem.setLinkedResourceTypes(operation, *resourceTypes);

    moduloProblem.insertOperation(operation);
  }

  // Second pass: dependences. Only auxiliary dependences are inserted
  // explicitly; distances are copied for every dependence that has one.
  for (auto *operation : prob.getOperations()) {
    for (auto dependence : prob.getDependences(operation)) {
      if (dependence.isAuxiliary()) {
        [[maybe_unused]] auto inserted =
            moduloProblem.insertDependence(dependence);
        assert(succeeded(inserted));
      }

      if (auto distance = prob.getDistance(dependence))
        moduloProblem.setDistance(dependence, *distance);
    }
  }

  return moduloProblem;
}
|
|
|
|
void AffineToLoopSchedule::runOnOperation() {
|
|
// Get dependence analysis for the whole function.
|
|
auto dependenceAnalysis = getAnalysis<MemoryDependenceAnalysis>();
|
|
|
|
// After dependence analysis, materialize affine structures.
|
|
if (failed(lowerAffineStructures(dependenceAnalysis)))
|
|
return signalPassFailure();
|
|
|
|
// Get scheduling analysis for the whole function.
|
|
schedulingAnalysis = &getAnalysis<CyclicSchedulingAnalysis>();
|
|
|
|
// Collect perfectly nested loops and work on them.
|
|
auto outerLoops = getOperation().getOps<AffineForOp>();
|
|
for (auto root : llvm::make_early_inc_range(outerLoops)) {
|
|
SmallVector<AffineForOp> nestedLoops;
|
|
getPerfectlyNestedLoops(nestedLoops, root);
|
|
|
|
// Restrict to single loops to simplify things for now.
|
|
if (nestedLoops.size() != 1)
|
|
continue;
|
|
|
|
ModuloProblem moduloProblem =
|
|
getModuloProblem(schedulingAnalysis->getProblem(nestedLoops.back()));
|
|
|
|
// Populate the target operator types.
|
|
if (failed(populateOperatorTypes(nestedLoops, moduloProblem)))
|
|
return signalPassFailure();
|
|
|
|
// Solve the scheduling problem computed by the analysis.
|
|
if (failed(solveSchedulingProblem(nestedLoops, moduloProblem)))
|
|
return signalPassFailure();
|
|
|
|
// Convert the IR.
|
|
if (failed(createLoopSchedulePipeline(nestedLoops, moduloProblem)))
|
|
return signalPassFailure();
|
|
}
|
|
}
|
|
|
|
/// Apply the affine map from an 'affine.load' operation to its operands, and
|
|
/// feed the results to a newly created 'memref.load' operation (which replaces
|
|
/// the original 'affine.load').
|
|
/// Also replaces the affine load with the memref load in dependenceAnalysis.
|
|
/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
|
|
class AffineLoadLowering : public OpConversionPattern<AffineLoadOp> {
|
|
public:
|
|
AffineLoadLowering(MLIRContext *context,
|
|
MemoryDependenceAnalysis &dependenceAnalysis)
|
|
: OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}
|
|
|
|
LogicalResult
|
|
matchAndRewrite(AffineLoadOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// Expand affine map from 'affineLoadOp'.
|
|
SmallVector<Value, 8> indices(op.getMapOperands());
|
|
auto resultOperands =
|
|
expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
|
|
if (!resultOperands)
|
|
return failure();
|
|
|
|
// Build memref.load memref[expandedMap.results].
|
|
auto memrefLoad = rewriter.replaceOpWithNewOp<memref::LoadOp>(
|
|
op, op.getMemRef(), *resultOperands);
|
|
|
|
dependenceAnalysis.replaceOp(op, memrefLoad);
|
|
|
|
return success();
|
|
}
|
|
|
|
private:
|
|
MemoryDependenceAnalysis &dependenceAnalysis;
|
|
};
|
|
|
|
/// Apply the affine map from an 'affine.store' operation to its operands, and
|
|
/// feed the results to a newly created 'memref.store' operation (which replaces
|
|
/// the original 'affine.store').
|
|
/// Also replaces the affine store with the memref store in dependenceAnalysis.
|
|
/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
|
|
class AffineStoreLowering : public OpConversionPattern<AffineStoreOp> {
|
|
public:
|
|
AffineStoreLowering(MLIRContext *context,
|
|
MemoryDependenceAnalysis &dependenceAnalysis)
|
|
: OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}
|
|
|
|
LogicalResult
|
|
matchAndRewrite(AffineStoreOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// Expand affine map from 'affineStoreOp'.
|
|
SmallVector<Value, 8> indices(op.getMapOperands());
|
|
auto maybeExpandedMap =
|
|
expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
|
|
if (!maybeExpandedMap)
|
|
return failure();
|
|
|
|
// Build memref.store valueToStore, memref[expandedMap.results].
|
|
auto memrefStore = rewriter.replaceOpWithNewOp<memref::StoreOp>(
|
|
op, op.getValueToStore(), op.getMemRef(), *maybeExpandedMap);
|
|
|
|
dependenceAnalysis.replaceOp(op, memrefStore);
|
|
|
|
return success();
|
|
}
|
|
|
|
private:
|
|
MemoryDependenceAnalysis &dependenceAnalysis;
|
|
};
|
|
|
|
/// Helper to hoist computation out of scf::IfOp branches, turning it into a
|
|
/// mux-like operation, and exposing potentially concurrent execution of its
|
|
/// branches.
|
|
struct IfOpHoisting : OpConversionPattern<IfOp> {
|
|
using OpConversionPattern<IfOp>::OpConversionPattern;
|
|
|
|
LogicalResult
|
|
matchAndRewrite(IfOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
rewriter.modifyOpInPlace(op, [&]() {
|
|
if (!op.thenBlock()->without_terminator().empty()) {
|
|
rewriter.splitBlock(op.thenBlock(), --op.thenBlock()->end());
|
|
rewriter.inlineBlockBefore(&op.getThenRegion().front(), op);
|
|
}
|
|
if (op.elseBlock() && !op.elseBlock()->without_terminator().empty()) {
|
|
rewriter.splitBlock(op.elseBlock(), --op.elseBlock()->end());
|
|
rewriter.inlineBlockBefore(&op.getElseRegion().front(), op);
|
|
}
|
|
});
|
|
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Helper to determine if an scf::IfOp is in mux-like form.
|
|
static bool ifOpLegalityCallback(IfOp op) {
|
|
return op.thenBlock()->without_terminator().empty() &&
|
|
(!op.elseBlock() || op.elseBlock()->without_terminator().empty());
|
|
}
|
|
|
|
/// Helper to mark AffineYieldOp legal, unless it is inside a partially
|
|
/// converted scf::IfOp.
|
|
static bool yieldOpLegalityCallback(AffineYieldOp op) {
|
|
return !op->getParentOfType<IfOp>();
|
|
}
|
|
|
|
/// After analyzing memory dependences, and before creating the schedule, we
|
|
/// want to materialize affine operations with arithmetic, scf, and memref
|
|
/// operations, which make the condition computation of addresses, etc.
|
|
/// explicit. This is important so the schedule can consider potentially complex
|
|
/// computations in the condition of ifs, or the addresses of loads and stores.
|
|
/// The dependence analysis will be updated so the dependences from the affine
|
|
/// loads and stores are now on the memref loads and stores.
|
|
LogicalResult AffineToLoopSchedule::lowerAffineStructures(
|
|
MemoryDependenceAnalysis &dependenceAnalysis) {
|
|
auto *context = &getContext();
|
|
auto op = getOperation();
|
|
|
|
ConversionTarget target(*context);
|
|
target.addLegalDialect<AffineDialect, ArithDialect, MemRefDialect,
|
|
SCFDialect>();
|
|
target.addIllegalOp<AffineIfOp, AffineLoadOp, AffineStoreOp>();
|
|
target.addDynamicallyLegalOp<IfOp>(ifOpLegalityCallback);
|
|
target.addDynamicallyLegalOp<AffineYieldOp>(yieldOpLegalityCallback);
|
|
|
|
RewritePatternSet patterns(context);
|
|
populateAffineToStdConversionPatterns(patterns);
|
|
patterns.add<AffineLoadLowering>(context, dependenceAnalysis);
|
|
patterns.add<AffineStoreLowering>(context, dependenceAnalysis);
|
|
patterns.add<IfOpHoisting>(context);
|
|
|
|
if (failed(applyPartialConversion(op, target, std::move(patterns))))
|
|
return failure();
|
|
|
|
return success();
|
|
}
|
|
|
|
/// Populate the schedling problem operator types for the dialect we are
|
|
/// targetting. Right now, we assume Calyx, which has a standard library with
|
|
/// well-defined operator latencies. Ultimately, we should move this to a
|
|
/// dialect interface in the Scheduling dialect.
|
|
LogicalResult AffineToLoopSchedule::populateOperatorTypes(
|
|
SmallVectorImpl<AffineForOp> &loopNest, ModuloProblem &problem) {
|
|
// Scheduling analyis only considers the innermost loop nest for now.
|
|
auto forOp = loopNest.back();
|
|
|
|
// Load the Calyx operator library into the problem. This is a very minimal
|
|
// set of arithmetic and memory operators for now. This should ultimately be
|
|
// pulled out into some sort of dialect interface.
|
|
Problem::OperatorType combOpr = problem.getOrInsertOperatorType("comb");
|
|
problem.setLatency(combOpr, 0);
|
|
Problem::OperatorType seqOpr = problem.getOrInsertOperatorType("seq");
|
|
problem.setLatency(seqOpr, 1);
|
|
Problem::OperatorType mcOpr = problem.getOrInsertOperatorType("multicycle");
|
|
problem.setLatency(mcOpr, 3);
|
|
|
|
Operation *unsupported;
|
|
WalkResult result = forOp.getBody()->walk([&](Operation *op) {
|
|
return TypeSwitch<Operation *, WalkResult>(op)
|
|
.Case<AddIOp, IfOp, AffineYieldOp, arith::ConstantOp, CmpIOp,
|
|
IndexCastOp, memref::AllocaOp, YieldOp>([&](Operation *combOp) {
|
|
// Some known combinational ops.
|
|
problem.setLinkedOperatorType(combOp, combOpr);
|
|
return WalkResult::advance();
|
|
})
|
|
.Case<AddIOp, CmpIOp>([&](Operation *seqOp) {
|
|
// These ops need to be sequential for now because we do not
|
|
// have enough information to chain them together yet.
|
|
problem.setLinkedOperatorType(seqOp, seqOpr);
|
|
return WalkResult::advance();
|
|
})
|
|
.Case<AffineStoreOp, memref::StoreOp>([&](Operation *memOp) {
|
|
// Some known sequential ops. In certain cases, reads may be
|
|
// combinational in Calyx, but taking advantage of that is left as
|
|
// a future enhancement.
|
|
Value memRef = isa<AffineStoreOp>(*memOp)
|
|
? cast<AffineStoreOp>(*memOp).getMemRef()
|
|
: cast<memref::StoreOp>(*memOp).getMemRef();
|
|
Problem::OperatorType memOpr = problem.getOrInsertOperatorType(
|
|
"mem_" + std::to_string(hash_value(memRef)));
|
|
problem.setLatency(memOpr, 1);
|
|
problem.setLinkedOperatorType(memOp, memOpr);
|
|
|
|
auto memRsrc = problem.getOrInsertResourceType(
|
|
"mem_" + std::to_string(hash_value(memRef)) + "_rsrc");
|
|
problem.setLimit(memRsrc, 1);
|
|
problem.setLinkedResourceTypes(
|
|
memOp, SmallVector<Problem::ResourceType>{memRsrc});
|
|
|
|
return WalkResult::advance();
|
|
})
|
|
.Case<AffineLoadOp, memref::LoadOp>([&](Operation *memOp) {
|
|
// Some known sequential ops. In certain cases, reads may be
|
|
// combinational in Calyx, but taking advantage of that is left as
|
|
// a future enhancement.
|
|
Value memRef = isa<AffineLoadOp>(*memOp)
|
|
? cast<AffineLoadOp>(*memOp).getMemRef()
|
|
: cast<memref::LoadOp>(*memOp).getMemRef();
|
|
Problem::OperatorType memOpr = problem.getOrInsertOperatorType(
|
|
"mem_" + std::to_string(hash_value(memRef)));
|
|
problem.setLatency(memOpr, 1);
|
|
problem.setLinkedOperatorType(memOp, memOpr);
|
|
|
|
auto memRsrc = problem.getOrInsertResourceType(
|
|
"mem_" + std::to_string(hash_value(memRef)) + "_rsrc");
|
|
problem.setLimit(memRsrc, 1);
|
|
problem.setLinkedResourceTypes(
|
|
memOp, SmallVector<Problem::ResourceType>{memRsrc});
|
|
|
|
return WalkResult::advance();
|
|
})
|
|
.Case<MulIOp>([&](Operation *mcOp) {
|
|
// Some known multi-cycle ops.
|
|
problem.setLinkedOperatorType(mcOp, mcOpr);
|
|
return WalkResult::advance();
|
|
})
|
|
.Default([&](Operation *badOp) {
|
|
unsupported = op;
|
|
return WalkResult::interrupt();
|
|
});
|
|
});
|
|
|
|
if (result.wasInterrupted())
|
|
return forOp.emitError("unsupported operation ") << *unsupported;
|
|
|
|
return success();
|
|
}
|
|
|
|
/// Solve the pre-computed scheduling problem.
|
|
LogicalResult AffineToLoopSchedule::solveSchedulingProblem(
|
|
SmallVectorImpl<AffineForOp> &loopNest, ModuloProblem &problem) {
|
|
// Scheduling analyis only considers the innermost loop nest for now.
|
|
auto forOp = loopNest.back();
|
|
|
|
// Optionally debug problem inputs.
|
|
LLVM_DEBUG(forOp.getBody()->walk<WalkOrder::PreOrder>([&](Operation *op) {
|
|
llvm::dbgs() << "Scheduling inputs for " << *op;
|
|
auto opr = problem.getLinkedOperatorType(op);
|
|
llvm::dbgs() << "\n opr = " << opr->getAttr();
|
|
llvm::dbgs() << "\n latency = " << problem.getLatency(*opr);
|
|
for (auto dep : problem.getDependences(op))
|
|
if (dep.isAuxiliary())
|
|
llvm::dbgs() << "\n dep = { distance = " << problem.getDistance(dep)
|
|
<< ", source = " << *dep.getSource() << " }";
|
|
llvm::dbgs() << "\n\n";
|
|
}));
|
|
|
|
// Verify and solve the problem.
|
|
if (failed(problem.check()))
|
|
return failure();
|
|
|
|
auto *anchor = forOp.getBody()->getTerminator();
|
|
if (failed(scheduleSimplex(problem, anchor)))
|
|
return failure();
|
|
|
|
// Verify the solution.
|
|
if (failed(problem.verify()))
|
|
return failure();
|
|
|
|
// Optionally debug problem outputs.
|
|
LLVM_DEBUG({
|
|
llvm::dbgs() << "Scheduled initiation interval = "
|
|
<< problem.getInitiationInterval() << "\n\n";
|
|
forOp.getBody()->walk<WalkOrder::PreOrder>([&](Operation *op) {
|
|
llvm::dbgs() << "Scheduling outputs for " << *op;
|
|
llvm::dbgs() << "\n start = " << problem.getStartTime(op);
|
|
llvm::dbgs() << "\n\n";
|
|
});
|
|
});
|
|
|
|
return success();
|
|
}
|
|
|
|
/// Create the loopschedule pipeline op for a loop nest.
|
|
LogicalResult AffineToLoopSchedule::createLoopSchedulePipeline(
|
|
SmallVectorImpl<AffineForOp> &loopNest, ModuloProblem &problem) {
|
|
// Scheduling analyis only considers the innermost loop nest for now.
|
|
auto forOp = loopNest.back();
|
|
|
|
auto outerLoop = loopNest.front();
|
|
auto innerLoop = loopNest.back();
|
|
ImplicitLocOpBuilder builder(outerLoop.getLoc(), outerLoop);
|
|
|
|
// Create Values for the loop's lower and upper bounds.
|
|
Value lowerBound = lowerAffineLowerBound(innerLoop, builder);
|
|
Value upperBound = lowerAffineUpperBound(innerLoop, builder);
|
|
int64_t stepValue = innerLoop.getStep().getSExtValue();
|
|
auto step = arith::ConstantOp::create(
|
|
builder, IntegerAttr::get(builder.getIndexType(), stepValue));
|
|
|
|
// Create the pipeline op, with the same result types as the inner loop. An
|
|
// iter arg is created for the induction variable.
|
|
TypeRange resultTypes = innerLoop.getResultTypes();
|
|
|
|
auto ii = builder.getI64IntegerAttr(problem.getInitiationInterval().value());
|
|
|
|
SmallVector<Value> iterArgs;
|
|
iterArgs.push_back(lowerBound);
|
|
iterArgs.append(innerLoop.getInits().begin(), innerLoop.getInits().end());
|
|
|
|
// If possible, attach a constant trip count attribute. This could be
|
|
// generalized to support non-constant trip counts by supporting an AffineMap.
|
|
std::optional<IntegerAttr> tripCountAttr;
|
|
if (auto tripCount = getConstantTripCount(forOp))
|
|
tripCountAttr = builder.getI64IntegerAttr(*tripCount);
|
|
|
|
auto pipeline = LoopSchedulePipelineOp::create(builder, resultTypes, ii,
|
|
tripCountAttr, iterArgs);
|
|
|
|
// Create the condition, which currently just compares the induction variable
|
|
// to the upper bound.
|
|
Block &condBlock = pipeline.getCondBlock();
|
|
builder.setInsertionPointToStart(&condBlock);
|
|
auto cmpResult = arith::CmpIOp::create(builder, builder.getI1Type(),
|
|
arith::CmpIPredicate::ult,
|
|
condBlock.getArgument(0), upperBound);
|
|
condBlock.getTerminator()->insertOperands(0, {cmpResult});
|
|
|
|
// Add the non-yield operations to their start time groups.
|
|
DenseMap<unsigned, SmallVector<Operation *>> startGroups;
|
|
for (auto *op : problem.getOperations()) {
|
|
if (isa<AffineYieldOp, YieldOp>(op))
|
|
continue;
|
|
auto startTime = problem.getStartTime(op);
|
|
startGroups[*startTime].push_back(op);
|
|
}
|
|
|
|
// Maintain mappings of values in the loop body and results of stages,
|
|
// initially populated with the iter args.
|
|
IRMapping valueMap;
|
|
// Nested loops are not supported yet.
|
|
assert(iterArgs.size() == forOp.getBody()->getNumArguments());
|
|
for (size_t i = 0; i < iterArgs.size(); ++i)
|
|
valueMap.map(forOp.getBody()->getArgument(i),
|
|
pipeline.getStagesBlock().getArgument(i));
|
|
|
|
// Create the stages.
|
|
Block &stagesBlock = pipeline.getStagesBlock();
|
|
builder.setInsertionPointToStart(&stagesBlock);
|
|
|
|
// Iterate in order of the start times.
|
|
SmallVector<unsigned> startTimes;
|
|
for (const auto &group : startGroups)
|
|
startTimes.push_back(group.first);
|
|
llvm::sort(startTimes);
|
|
|
|
DominanceInfo dom(getOperation());
|
|
|
|
// Keys for translating values in each stage
|
|
SmallVector<SmallVector<Value>> registerValues;
|
|
SmallVector<SmallVector<Type>> registerTypes;
|
|
|
|
// The maps that ensure a stage uses the correct version of a value
|
|
SmallVector<IRMapping> stageValueMaps;
|
|
|
|
// For storing the range of stages an operation's results need to be valid for
|
|
DenseMap<Operation *, std::pair<unsigned, unsigned>> pipeTimes;
|
|
|
|
for (auto startTime : startTimes) {
|
|
auto group = startGroups[startTime];
|
|
|
|
// Collect the return types for this stage. Operations whose results are not
|
|
// used within this stage are returned.
|
|
auto isLoopTerminator = [forOp](Operation *op) {
|
|
return isa<AffineYieldOp>(op) && op->getParentOp() == forOp;
|
|
};
|
|
|
|
// Initialize set of registers up until this point in time
|
|
for (unsigned i = registerValues.size(); i <= startTime; ++i)
|
|
registerValues.emplace_back(SmallVector<Value>());
|
|
|
|
// Check each operation to see if its results need plumbing
|
|
for (auto *op : group) {
|
|
if (op->getUsers().empty())
|
|
continue;
|
|
|
|
unsigned pipeEndTime = 0;
|
|
for (auto *user : op->getUsers()) {
|
|
unsigned userStartTime = *problem.getStartTime(user);
|
|
if (*problem.getStartTime(user) > startTime)
|
|
pipeEndTime = std::max(pipeEndTime, userStartTime);
|
|
else if (isLoopTerminator(user))
|
|
// Manually forward the value into the terminator's valueMap
|
|
pipeEndTime = std::max(pipeEndTime, userStartTime + 1);
|
|
}
|
|
|
|
// Insert the range of pipeline stages the value needs to be valid for
|
|
pipeTimes[op] = std::pair(startTime, pipeEndTime);
|
|
|
|
// Add register stages for each time slice we need to pipe to
|
|
for (unsigned i = registerValues.size(); i <= pipeEndTime; ++i)
|
|
registerValues.push_back(SmallVector<Value>());
|
|
|
|
// Keep a collection of this stages results as keys to our valueMaps
|
|
for (auto result : op->getResults())
|
|
registerValues[startTime].push_back(result);
|
|
|
|
// Other stages that use the value will need these values as keys too
|
|
unsigned firstUse = std::max(
|
|
startTime + 1,
|
|
startTime + *problem.getLatency(*problem.getLinkedOperatorType(op)));
|
|
for (unsigned i = firstUse; i < pipeEndTime; ++i) {
|
|
for (auto result : op->getResults())
|
|
registerValues[i].push_back(result);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Now make register Types and stageValueMaps
|
|
for (unsigned i = 0; i < registerValues.size(); ++i) {
|
|
SmallVector<mlir::Type> types;
|
|
for (auto val : registerValues[i])
|
|
types.push_back(val.getType());
|
|
|
|
registerTypes.push_back(types);
|
|
stageValueMaps.push_back(valueMap);
|
|
}
|
|
|
|
// One more map is needed for the pipeline stages terminator
|
|
stageValueMaps.push_back(valueMap);
|
|
|
|
// Create stages along with maps
|
|
for (auto startTime : startTimes) {
|
|
auto group = startGroups[startTime];
|
|
llvm::sort(group,
|
|
[&](Operation *a, Operation *b) { return dom.dominates(a, b); });
|
|
auto stageTypes = registerTypes[startTime];
|
|
// Add the induction variable increment in the first stage.
|
|
if (startTime == 0)
|
|
stageTypes.push_back(lowerBound.getType());
|
|
|
|
// Create the stage itself.
|
|
builder.setInsertionPoint(stagesBlock.getTerminator());
|
|
auto startTimeAttr = builder.getIntegerAttr(
|
|
builder.getIntegerType(64, /*isSigned=*/true), startTime);
|
|
auto stage =
|
|
LoopSchedulePipelineStageOp::create(builder, stageTypes, startTimeAttr);
|
|
auto &stageBlock = stage.getBodyBlock();
|
|
auto *stageTerminator = stageBlock.getTerminator();
|
|
builder.setInsertionPointToStart(&stageBlock);
|
|
|
|
for (auto *op : group) {
|
|
auto *newOp = builder.clone(*op, stageValueMaps[startTime]);
|
|
|
|
// All further uses in this stage should used the cloned-version of values
|
|
// So we update the mapping in this stage
|
|
for (auto result : op->getResults())
|
|
stageValueMaps[startTime].map(
|
|
result, newOp->getResult(result.getResultNumber()));
|
|
}
|
|
|
|
// Register all values in the terminator, using their mapped value
|
|
SmallVector<Value> stageOperands;
|
|
unsigned resIndex = 0;
|
|
for (auto res : registerValues[startTime]) {
|
|
stageOperands.push_back(stageValueMaps[startTime].lookup(res));
|
|
// Additionally, update the map of the stage that will consume the
|
|
// registered value
|
|
unsigned destTime = startTime + 1;
|
|
unsigned latency = *problem.getLatency(
|
|
*problem.getLinkedOperatorType(res.getDefiningOp()));
|
|
// Multi-cycle case
|
|
if (*problem.getStartTime(res.getDefiningOp()) == startTime &&
|
|
latency > 1)
|
|
destTime = startTime + latency;
|
|
destTime = std::min((unsigned)(stageValueMaps.size() - 1), destTime);
|
|
stageValueMaps[destTime].map(res, stage.getResult(resIndex++));
|
|
}
|
|
// Add these mapped values to pipeline.register
|
|
stageTerminator->insertOperands(stageTerminator->getNumOperands(),
|
|
stageOperands);
|
|
|
|
// Add the induction variable increment to the first stage.
|
|
if (startTime == 0) {
|
|
auto incResult =
|
|
arith::AddIOp::create(builder, stagesBlock.getArgument(0), step);
|
|
stageTerminator->insertOperands(stageTerminator->getNumOperands(),
|
|
incResult->getResults());
|
|
}
|
|
}
|
|
|
|
// Add the iter args and results to the terminator.
|
|
auto stagesTerminator =
|
|
cast<LoopScheduleTerminatorOp>(stagesBlock.getTerminator());
|
|
|
|
// Collect iter args and results from the induction variable increment and any
|
|
// mapped values that were originally yielded.
|
|
SmallVector<Value> termIterArgs;
|
|
SmallVector<Value> termResults;
|
|
termIterArgs.push_back(
|
|
stagesBlock.front().getResult(stagesBlock.front().getNumResults() - 1));
|
|
|
|
for (auto value : forOp.getBody()->getTerminator()->getOperands()) {
|
|
unsigned lookupTime = std::min((unsigned)(stageValueMaps.size() - 1),
|
|
pipeTimes[value.getDefiningOp()].second);
|
|
|
|
termIterArgs.push_back(stageValueMaps[lookupTime].lookup(value));
|
|
termResults.push_back(stageValueMaps[lookupTime].lookup(value));
|
|
}
|
|
|
|
stagesTerminator.getIterArgsMutable().append(termIterArgs);
|
|
stagesTerminator.getResultsMutable().append(termResults);
|
|
|
|
// Replace loop results with pipeline results.
|
|
for (size_t i = 0; i < forOp.getNumResults(); ++i)
|
|
forOp.getResult(i).replaceAllUsesWith(pipeline.getResult(i));
|
|
|
|
// Remove the loop nest from the IR.
|
|
loopNest.front().walk([](Operation *op) {
|
|
op->dropAllUses();
|
|
op->dropAllDefinedValueUses();
|
|
op->dropAllReferences();
|
|
op->erase();
|
|
});
|
|
|
|
return success();
|
|
}
|
|
|
|
std::unique_ptr<mlir::Pass> circt::createAffineToLoopSchedule() {
|
|
return std::make_unique<AffineToLoopSchedule>();
|
|
}
|