forked from OSchip/llvm-project
				
			
		
			
				
	
	
		
			325 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			325 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
//===-------- `Benchmark` function ------------------------------*- C++ -*-===//
 | 
						|
//
 | 
						|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 | 
						|
// See https://llvm.org/LICENSE.txt for license information.
 | 
						|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 | 
						|
//
 | 
						|
//===----------------------------------------------------------------------===//
 | 
						|
 | 
						|
// This file mainly defines the `benchmark` function.
 | 
						|
//
 | 
						|
// The benchmarking process is as follows:
 | 
						|
// - We start by measuring the time it takes to run the function
 | 
						|
// `InitialIterations` times. This is called a Sample. From this we can derive
 | 
						|
// the time it took to run a single iteration.
 | 
						|
//
 | 
						|
// - We repeat the previous step with a greater number of iterations to lower
 | 
						|
// the impact of the measurement. We can derive a more precise estimation of the
 | 
						|
// runtime for a single iteration.
 | 
						|
//
 | 
						|
// - Each sample gives a more accurate estimation of the runtime for a single
 | 
						|
// iteration but also takes more time to run. We stop the process when:
 | 
						|
//   * The measurement stabilizes within a certain precision (Epsilon),
 | 
						|
//   * The overall benchmarking time is greater than MaxDuration,
 | 
						|
//   * The overall sample count is greater than MaxSamples,
 | 
						|
//   * The last sample used more than MaxIterations iterations.
 | 
						|
//
 | 
						|
// - We also make sure that the benchmark doesn't run for too short a period of
 | 
						|
// time by defining MinDuration and MinSamples.
 | 
						|
 | 
						|
#ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
 | 
						|
#define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
 | 
						|
 | 
						|
#include "benchmark/benchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>
#include <vector>
 | 
						|
 | 
						|
namespace llvm {
 | 
						|
namespace libc_benchmarks {
 | 
						|
 | 
						|
// Makes sure the binary was compiled in release mode and that frequency
 | 
						|
// governor is set to performance.
 | 
						|
void checkRequirements();
 | 
						|
 | 
						|
// All durations in this file are expressed in seconds and stored as a `double`.
using Duration = std::chrono::duration<double>;
 | 
						|
 | 
						|
// Selects how much of the benchmark's internal state is kept in the result.
enum class BenchmarkLog {
  // Don't keep the internal state of the benchmark.
  None,
  // Keep only the last batch.
  Last,
  // Keep all iterations states, useful for testing or debugging.
  Full
};
 | 
						|
 | 
						|
// An object to configure the benchmark stopping conditions.
 | 
						|
// See documentation at the beginning of the file for the overall algorithm and
 | 
						|
// meaning of each field.
 | 
						|
struct BenchmarkOptions {
 | 
						|
  // The minimum time for which the benchmark is running.
 | 
						|
  Duration MinDuration = std::chrono::seconds(0);
 | 
						|
  // The maximum time for which the benchmark is running.
 | 
						|
  Duration MaxDuration = std::chrono::seconds(10);
 | 
						|
  // The number of iterations in the first sample.
 | 
						|
  uint32_t InitialIterations = 1;
 | 
						|
  // The maximum number of iterations for any given sample.
 | 
						|
  uint32_t MaxIterations = 10000000;
 | 
						|
  // The minimum number of samples.
 | 
						|
  uint32_t MinSamples = 4;
 | 
						|
  // The maximum number of samples.
 | 
						|
  uint32_t MaxSamples = 1000;
 | 
						|
  // The benchmark will stop is the relative difference between the current and
 | 
						|
  // the last estimation is less than epsilon. This is 1% by default.
 | 
						|
  double Epsilon = 0.01;
 | 
						|
  // The number of iterations grows exponentially between each sample.
 | 
						|
  // Must be greater or equal to 1.
 | 
						|
  double ScalingFactor = 1.4;
 | 
						|
  BenchmarkLog Log = BenchmarkLog::None;
 | 
						|
};
 | 
						|
 | 
						|
// The state of a benchmark: either still running, or the reason it stopped.
enum class BenchmarkStatus {
  // No stopping condition has been met yet.
  Running,
  // The overall benchmarking time reached `MaxDuration`.
  MaxDurationReached,
  // The last sample used at least `MaxIterations` iterations.
  MaxIterationsReached,
  // The overall sample count reached `MaxSamples`.
  MaxSamplesReached,
  // The estimation stabilized under `Epsilon`.
  PrecisionReached,
};
 | 
						|
 | 
						|
// The internal state of the benchmark, useful to debug, test or report
 | 
						|
// statistics.
 | 
						|
struct BenchmarkState {
 | 
						|
  size_t LastSampleIterations;
 | 
						|
  Duration LastBatchElapsed;
 | 
						|
  BenchmarkStatus CurrentStatus;
 | 
						|
  Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
 | 
						|
  double ChangeRatio; // The change in time estimation between previous and
 | 
						|
                      // current samples.
 | 
						|
};
 | 
						|
 | 
						|
// A lightweight result for a benchmark.
 | 
						|
struct BenchmarkResult {
 | 
						|
  BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
 | 
						|
  Duration BestGuess = {};
 | 
						|
  llvm::Optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
 | 
						|
};
 | 
						|
 | 
						|
// Stores information about a cache in the host memory system.
//
// The numeric members default to zero so that a default-constructed
// `CacheInfo` never holds indeterminate values.
struct CacheInfo {
  // e.g. "Instruction", "Data", "Unified".
  std::string Type;
  // 0 is closest to processing unit.
  int Level = 0;
  // In bytes.
  int Size = 0;
  // The number of processing units (Hyper-Threading Thread) with which this
  // cache is shared.
  int NumSharing = 0;
};
 | 
						|
 | 
						|
// Stores information about the host.
 | 
						|
struct HostState {
 | 
						|
  std::string CpuName; // returns a string compatible with the -march option.
 | 
						|
  double CpuFrequency; // in Hertz.
 | 
						|
  std::vector<CacheInfo> Caches;
 | 
						|
 | 
						|
  static HostState get();
 | 
						|
};
 | 
						|
 | 
						|
namespace internal {
 | 
						|
 | 
						|
struct Measurement {
 | 
						|
  size_t Iterations = 0;
 | 
						|
  Duration Elapsed = {};
 | 
						|
};
 | 
						|
 | 
						|
// Updates the estimation of the elapsed time for a single iteration.
 | 
						|
class RefinableRuntimeEstimation {
 | 
						|
  Duration TotalTime = {};
 | 
						|
  size_t TotalIterations = 0;
 | 
						|
 | 
						|
public:
 | 
						|
  Duration update(const Measurement &M) {
 | 
						|
    assert(M.Iterations > 0);
 | 
						|
    // Duration is encoded as a double (see definition).
 | 
						|
    // `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
 | 
						|
    // loss of precision due to radically different scales.
 | 
						|
    TotalTime += M.Elapsed;
 | 
						|
    TotalIterations += M.Iterations;
 | 
						|
    return TotalTime / TotalIterations;
 | 
						|
  }
 | 
						|
};
 | 
						|
 | 
						|
// This class tracks the progression of the runtime estimation.
 | 
						|
class RuntimeEstimationProgression {
 | 
						|
  RefinableRuntimeEstimation RRE;
 | 
						|
 | 
						|
public:
 | 
						|
  Duration CurrentEstimation = {};
 | 
						|
 | 
						|
  // Returns the change ratio between our best guess so far and the one from the
 | 
						|
  // new measurement.
 | 
						|
  double computeImprovement(const Measurement &M) {
 | 
						|
    const Duration NewEstimation = RRE.update(M);
 | 
						|
    const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
 | 
						|
    CurrentEstimation = NewEstimation;
 | 
						|
    return Ratio;
 | 
						|
  }
 | 
						|
};
 | 
						|
 | 
						|
} // namespace internal
 | 
						|
 | 
						|
// Measures the runtime of `foo` until conditions defined by `Options` are met.
 | 
						|
//
 | 
						|
// To avoid measurement's imprecisions we measure batches of `foo`.
 | 
						|
// The batch size is growing by `ScalingFactor` to minimize the effect of
 | 
						|
// measuring.
 | 
						|
//
 | 
						|
// Note: The benchmark is not responsible for serializing the executions of
 | 
						|
// `foo`. It is not suitable for measuring, very small & side effect free
 | 
						|
// functions, as the processor is free to execute serveral executions in
 | 
						|
// parallel.
 | 
						|
//
 | 
						|
// - Options: A set of parameters controlling the stopping conditions for the
 | 
						|
//     benchmark.
 | 
						|
// - foo: The function under test. It takes one value and returns one value.
 | 
						|
//     The input value is used to randomize the execution of `foo` as part of a
 | 
						|
//     batch to mitigate the effect of the branch predictor. Signature:
 | 
						|
//     `ProductType foo(ParameterProvider::value_type value);`
 | 
						|
//     The output value is a product of the execution of `foo` and prevents the
 | 
						|
//     compiler from optimizing out foo's body.
 | 
						|
// - ParameterProvider: An object responsible for providing a range of
 | 
						|
//     `Iterations` values to use as input for `foo`. The `value_type` of the
 | 
						|
//     returned container has to be compatible with `foo` argument.
 | 
						|
//     Must implement one of:
 | 
						|
//     `Container<ParameterType> generateBatch(size_t Iterations);`
 | 
						|
//     `const Container<ParameterType>& generateBatch(size_t Iterations);`
 | 
						|
// - Clock: An object providing the current time. Must implement:
 | 
						|
//     `std::chrono::time_point now();`
 | 
						|
template <typename Function, typename ParameterProvider,
 | 
						|
          typename BenchmarkClock = const std::chrono::high_resolution_clock>
 | 
						|
BenchmarkResult benchmark(const BenchmarkOptions &Options,
 | 
						|
                          ParameterProvider &PP, Function foo,
 | 
						|
                          BenchmarkClock &Clock = BenchmarkClock()) {
 | 
						|
  BenchmarkResult Result;
 | 
						|
  internal::RuntimeEstimationProgression REP;
 | 
						|
  Duration TotalBenchmarkDuration = {};
 | 
						|
  size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
 | 
						|
  size_t Samples = 0;
 | 
						|
  if (Options.ScalingFactor < 1.0)
 | 
						|
    report_fatal_error("ScalingFactor should be >= 1");
 | 
						|
  if (Options.Log != BenchmarkLog::None)
 | 
						|
    Result.MaybeBenchmarkLog.emplace();
 | 
						|
  for (;;) {
 | 
						|
    // Request a new Batch of size `Iterations`.
 | 
						|
    const auto &Batch = PP.generateBatch(Iterations);
 | 
						|
 | 
						|
    // Measuring this Batch.
 | 
						|
    const auto StartTime = Clock.now();
 | 
						|
    for (const auto Parameter : Batch) {
 | 
						|
      const auto Production = foo(Parameter);
 | 
						|
      benchmark::DoNotOptimize(Production);
 | 
						|
    }
 | 
						|
    const auto EndTime = Clock.now();
 | 
						|
    const Duration Elapsed = EndTime - StartTime;
 | 
						|
 | 
						|
    // Updating statistics.
 | 
						|
    ++Samples;
 | 
						|
    TotalBenchmarkDuration += Elapsed;
 | 
						|
    const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
 | 
						|
    Result.BestGuess = REP.CurrentEstimation;
 | 
						|
 | 
						|
    // Stopping condition.
 | 
						|
    if (TotalBenchmarkDuration >= Options.MinDuration &&
 | 
						|
        Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
 | 
						|
      Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
 | 
						|
    else if (Samples >= Options.MaxSamples)
 | 
						|
      Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
 | 
						|
    else if (TotalBenchmarkDuration >= Options.MaxDuration)
 | 
						|
      Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
 | 
						|
    else if (Iterations >= Options.MaxIterations)
 | 
						|
      Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
 | 
						|
 | 
						|
    if (Result.MaybeBenchmarkLog) {
 | 
						|
      auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
 | 
						|
      if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
 | 
						|
        BenchmarkLog.pop_back();
 | 
						|
      BenchmarkState BS;
 | 
						|
      BS.LastSampleIterations = Iterations;
 | 
						|
      BS.LastBatchElapsed = Elapsed;
 | 
						|
      BS.CurrentStatus = Result.TerminationStatus;
 | 
						|
      BS.CurrentBestGuess = Result.BestGuess;
 | 
						|
      BS.ChangeRatio = ChangeRatio;
 | 
						|
      BenchmarkLog.push_back(BS);
 | 
						|
    }
 | 
						|
 | 
						|
    if (Result.TerminationStatus != BenchmarkStatus::Running)
 | 
						|
      return Result;
 | 
						|
 | 
						|
    if (Options.ScalingFactor > 1 &&
 | 
						|
        Iterations * Options.ScalingFactor == Iterations)
 | 
						|
      report_fatal_error(
 | 
						|
          "`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
 | 
						|
          "or InitialIterations.");
 | 
						|
 | 
						|
    Iterations *= Options.ScalingFactor;
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
// Interprets `Array` as a circular buffer of `Size` elements.
 | 
						|
template <typename T> class CircularArrayRef {
 | 
						|
  llvm::ArrayRef<T> Array;
 | 
						|
  size_t Size;
 | 
						|
 | 
						|
public:
 | 
						|
  using value_type = T;
 | 
						|
  using reference = T &;
 | 
						|
  using const_reference = const T &;
 | 
						|
  using difference_type = ssize_t;
 | 
						|
  using size_type = size_t;
 | 
						|
 | 
						|
  class const_iterator
 | 
						|
      : public std::iterator<std::input_iterator_tag, T, ssize_t> {
 | 
						|
    llvm::ArrayRef<T> Array;
 | 
						|
    size_t Index;
 | 
						|
 | 
						|
  public:
 | 
						|
    explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
 | 
						|
        : Array(Array), Index(Index) {}
 | 
						|
    const_iterator &operator++() {
 | 
						|
      ++Index;
 | 
						|
      return *this;
 | 
						|
    }
 | 
						|
    bool operator==(const_iterator Other) const { return Index == Other.Index; }
 | 
						|
    bool operator!=(const_iterator Other) const { return !(*this == Other); }
 | 
						|
    const T &operator*() const { return Array[Index % Array.size()]; }
 | 
						|
  };
 | 
						|
 | 
						|
  CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
 | 
						|
      : Array(Array), Size(Size) {
 | 
						|
    assert(Array.size() > 0);
 | 
						|
  }
 | 
						|
 | 
						|
  const_iterator begin() const { return const_iterator(Array); }
 | 
						|
  const_iterator end() const { return const_iterator(Array, Size); }
 | 
						|
};
 | 
						|
 | 
						|
// A convenient helper to produce a CircularArrayRef from an ArrayRef.
 | 
						|
template <typename T>
 | 
						|
CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
 | 
						|
  return {Array, Size};
 | 
						|
}
 | 
						|
 | 
						|
// A std::array of `T` whose storage size does not exceed `Bytes`: it holds as
// many whole elements as fit in `Bytes` bytes.
template <typename T, size_t Bytes>
using ByteConstrainedArray = std::array<T, (Bytes / sizeof(T))>;
 | 
						|
 | 
						|
// A convenient helper to produce a CircularArrayRef from a
 | 
						|
// ByteConstrainedArray.
 | 
						|
template <typename T, size_t N>
 | 
						|
CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
 | 
						|
  return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
 | 
						|
}
 | 
						|
 | 
						|
} // namespace libc_benchmarks
 | 
						|
} // namespace llvm
 | 
						|
 | 
						|
#endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
 |